From aeac6b34318fe7aedf5515c72aefa8a7d9937d5b Mon Sep 17 00:00:00 2001 From: huzhenhong <455879568@qq.com> Date: Wed, 29 Nov 2023 18:20:42 +0800 Subject: [PATCH 1/5] Support macOS onnxruntime --- mmdeploy/backend/onnxruntime/init_plugins.py | 2 ++ setup.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/mmdeploy/backend/onnxruntime/init_plugins.py b/mmdeploy/backend/onnxruntime/init_plugins.py index fd0d850fe5..3897be22a9 100644 --- a/mmdeploy/backend/onnxruntime/init_plugins.py +++ b/mmdeploy/backend/onnxruntime/init_plugins.py @@ -13,6 +13,7 @@ def get_ops_path() -> str: candidates = [ '../../lib/libmmdeploy_onnxruntime_ops.so', '../../lib/mmdeploy_onnxruntime_ops.dll', + '../../lib/libmmdeploy_onnxruntime_ops.dylib', ] return get_file_path(os.path.dirname(__file__), candidates) @@ -26,5 +27,6 @@ def get_lib_path() -> str: candidates = [ '../../lib/libonnxruntime.so*', '../../lib/onnxruntime.dll', + '../../lib/libonnxruntime*.dylib', ] return get_file_path(os.path.dirname(__file__), candidates) diff --git a/setup.py b/setup.py index ddd853648d..aa71fd2898 100644 --- a/setup.py +++ b/setup.py @@ -138,7 +138,9 @@ def get_extensions(): # environment, the compiler will choose the appropriate compiler # to compile those cpp files, so there is no need to add the # argument - if platform.system() != 'Windows': + if platform.system() == 'Darwin': + extra_compile_args['cxx'] = ['-std=c++17'] + elif platform.system() != 'Windows': extra_compile_args['cxx'] = ['-std=c++14'] include_dirs = [] From 6a6314aeb2e4b42e3de61a886818bc0f799afd85 Mon Sep 17 00:00:00 2001 From: huzhenhong <455879568@qq.com> Date: Mon, 11 Dec 2023 21:02:23 +0800 Subject: [PATCH 2/5] Format cpp and cuda code --- .clang-format | 403 +- csrc/mmdeploy/apis/c/mmdeploy/classifier.cpp | 196 +- csrc/mmdeploy/apis/c/mmdeploy/classifier.h | 233 +- csrc/mmdeploy/apis/c/mmdeploy/common.cpp | 205 +- csrc/mmdeploy/apis/c/mmdeploy/common.h | 247 +- .../apis/c/mmdeploy/common_internal.h | 215 +- csrc/mmdeploy/apis/c/mmdeploy/detector.cpp | 208 +- csrc/mmdeploy/apis/c/mmdeploy/detector.h | 231 +- csrc/mmdeploy/apis/c/mmdeploy/executor.cpp | 350 +- csrc/mmdeploy/apis/c/mmdeploy/executor.h | 208 +- .../apis/c/mmdeploy/executor_internal.h | 66 +- csrc/mmdeploy/apis/c/mmdeploy/handle.h | 83 +- csrc/mmdeploy/apis/c/mmdeploy/model.cpp | 61 +- csrc/mmdeploy/apis/c/mmdeploy/model.h | 53 +- csrc/mmdeploy/apis/c/mmdeploy/pipeline.cpp | 137 +- csrc/mmdeploy/apis/c/mmdeploy/pipeline.h | 92 +- .../apis/c/mmdeploy/pose_detector.cpp | 301 +- csrc/mmdeploy/apis/c/mmdeploy/pose_detector.h | 208 +- .../mmdeploy/apis/c/mmdeploy/pose_tracker.cpp | 272 +- csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h | 273 +- csrc/mmdeploy/apis/c/mmdeploy/restorer.cpp | 177 +- csrc/mmdeploy/apis/c/mmdeploy/restorer.h | 132 +- .../apis/c/mmdeploy/rotated_detector.cpp | 200 +- .../apis/c/mmdeploy/rotated_detector.h | 235 +- csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp | 189 +- csrc/mmdeploy/apis/c/mmdeploy/segmentor.h | 165 +- .../apis/c/mmdeploy/text_detector.cpp | 281 +- csrc/mmdeploy/apis/c/mmdeploy/text_detector.h | 272 +- .../apis/c/mmdeploy/text_recognizer.cpp | 359 +- .../apis/c/mmdeploy/text_recognizer.h | 288 +- .../apis/c/mmdeploy/video_recognizer.cpp | 262 +- .../apis/c/mmdeploy/video_recognizer.h | 237 +- .../mmdeploy/apis/cxx/mmdeploy/classifier.hpp | 136 +- csrc/mmdeploy/apis/cxx/mmdeploy/common.hpp | 603 +- csrc/mmdeploy/apis/cxx/mmdeploy/detector.hpp | 136 +- csrc/mmdeploy/apis/cxx/mmdeploy/pipeline.hpp | 127 +-
.../apis/cxx/mmdeploy/pose_detector.hpp | 155 +- .../apis/cxx/mmdeploy/pose_tracker.hpp | 304 +- csrc/mmdeploy/apis/cxx/mmdeploy/restorer.hpp | 127 +- .../apis/cxx/mmdeploy/rotated_detector.hpp | 138 +- csrc/mmdeploy/apis/cxx/mmdeploy/segmentor.hpp | 133 +- .../apis/cxx/mmdeploy/text_detector.hpp | 138 +- .../apis/cxx/mmdeploy/text_recognizer.hpp | 161 +- .../apis/cxx/mmdeploy/video_recognizer.hpp | 170 +- csrc/mmdeploy/apis/java/native/common.h | 81 +- .../apis/java/native/mmdeploy_Classifier.cpp | 46 +- .../apis/java/native/mmdeploy_Classifier.h | 50 +- .../apis/java/native/mmdeploy_Context.cpp | 63 +- .../apis/java/native/mmdeploy_Context.h | 49 +- .../apis/java/native/mmdeploy_Detector.cpp | 44 +- .../apis/java/native/mmdeploy_Detector.h | 50 +- .../apis/java/native/mmdeploy_Device.cpp | 29 +- .../apis/java/native/mmdeploy_Device.h | 37 +- .../apis/java/native/mmdeploy_Model.cpp | 29 +- .../apis/java/native/mmdeploy_Model.h | 37 +- .../java/native/mmdeploy_PoseDetector.cpp | 45 +- .../apis/java/native/mmdeploy_PoseDetector.h | 51 +- .../apis/java/native/mmdeploy_PoseTracker.cpp | 261 +- .../apis/java/native/mmdeploy_PoseTracker.h | 86 +- .../apis/java/native/mmdeploy_Profiler.cpp | 29 +- .../apis/java/native/mmdeploy_Profiler.h | 37 +- .../apis/java/native/mmdeploy_Restorer.cpp | 44 +- .../apis/java/native/mmdeploy_Restorer.h | 49 +- .../java/native/mmdeploy_RotatedDetector.cpp | 45 +- .../java/native/mmdeploy_RotatedDetector.h | 51 +- .../apis/java/native/mmdeploy_Scheduler.cpp | 21 +- .../apis/java/native/mmdeploy_Scheduler.h | 49 +- .../apis/java/native/mmdeploy_Segmentor.cpp | 44 +- .../apis/java/native/mmdeploy_Segmentor.h | 50 +- .../java/native/mmdeploy_TextDetector.cpp | 45 +- .../apis/java/native/mmdeploy_TextDetector.h | 51 +- .../java/native/mmdeploy_TextRecognizer.cpp | 56 +- .../java/native/mmdeploy_TextRecognizer.h | 65 +- csrc/mmdeploy/apis/python/classifier.cpp | 122 +- csrc/mmdeploy/apis/python/common.cpp | 334 +- csrc/mmdeploy/apis/python/common.h | 27 +- csrc/mmdeploy/apis/python/detector.cpp | 161 +- csrc/mmdeploy/apis/python/executor.cpp | 69 +- csrc/mmdeploy/apis/python/internal.cpp | 99 +- csrc/mmdeploy/apis/python/pipeline.cpp | 60 +- csrc/mmdeploy/apis/python/pose_detector.cpp | 235 +- csrc/mmdeploy/apis/python/pose_tracker.cpp | 314 +- csrc/mmdeploy/apis/python/restorer.cpp | 120 +- .../mmdeploy/apis/python/rotated_detector.cpp | 141 +- csrc/mmdeploy/apis/python/segmentor.cpp | 145 +- csrc/mmdeploy/apis/python/text_detector.cpp | 131 +- csrc/mmdeploy/apis/python/text_recognizer.cpp | 162 +- .../mmdeploy/apis/python/video_recognizer.cpp | 165 +- csrc/mmdeploy/archive/json_archive.h | 434 +- csrc/mmdeploy/archive/value_archive.h | 288 +- .../common_cuda_helper.cuh | 132 +- .../modulated_deform_conv_cpu.h | 129 +- .../modulated_deform_conv_cuda.cuh | 218 +- .../backend_ops/ncnn/onnx2ncnn/fuse_pass.cpp | 4300 ++++---- .../backend_ops/ncnn/onnx2ncnn/fuse_pass.h | 148 +- .../backend_ops/ncnn/onnx2ncnn/onnx2ncnn.cpp | 6078 ++++++----- .../ncnn/onnx2ncnn/shape_inference.cpp | 300 +- .../ncnn/onnx2ncnn/shape_inference.h | 5 +- .../backend_ops/ncnn/onnx2ncnn/utils.h | 761 +- .../ops/constantofshape/constantofshape.cpp | 95 +- .../ops/constantofshape/constantofshape.h | 21 +- .../backend_ops/ncnn/ops/expand/expand.cpp | 742 +- .../backend_ops/ncnn/ops/expand/expand.h | 15 +- .../backend_ops/ncnn/ops/gather/gather.cpp | 301 +- .../backend_ops/ncnn/ops/gather/gather.h | 21 +- .../backend_ops/ncnn/ops/ncnn_ops_definer.h | 30 +- .../ncnn/ops/ncnn_ops_register.cpp | 48 +- 
.../backend_ops/ncnn/ops/ncnn_ops_register.h | 2 +- .../backend_ops/ncnn/ops/shape/shape.cpp | 85 +- .../backend_ops/ncnn/ops/shape/shape.h | 15 +- .../ncnn/ops/tensorslice/tensorslice.cpp | 418 +- .../ncnn/ops/tensorslice/tensorslice.h | 27 +- .../backend_ops/ncnn/ops/topk/topk.cpp | 1914 ++-- .../mmdeploy/backend_ops/ncnn/ops/topk/topk.h | 27 +- .../backend_ops/ncnn/pyncnn_ext/ncnn_ext.cpp | 12 +- .../onnxruntime/common/onnxruntime_register.h | 7 +- .../onnxruntime/common/ort_utils.cpp | 12 +- .../onnxruntime/common/ort_utils.h | 59 +- .../onnxruntime/grid_sample/grid_sample.cpp | 592 +- .../onnxruntime/grid_sample/grid_sample.h | 90 +- .../modulated_deform_conv.cpp | 389 +- .../modulated_deform_conv.h | 119 +- .../onnxruntime/nms_match/nms_match.cpp | 233 +- .../onnxruntime/nms_match/nms_match.h | 80 +- .../onnxruntime/nms_rotated/nms_rotated.cpp | 736 +- .../onnxruntime/nms_rotated/nms_rotated.h | 84 +- .../onnxruntime/onnxruntime_register.cpp | 41 +- .../roi_align_rotated/roi_align_rotated.cpp | 458 +- .../roi_align_rotated/roi_align_rotated.h | 108 +- .../tensorrt/batched_nms/trt_batched_nms.cpp | 511 +- .../tensorrt/batched_nms/trt_batched_nms.hpp | 94 +- .../trt_batched_rotated_nms.cpp | 509 +- .../trt_batched_rotated_nms.hpp | 86 +- .../trt_bicubic_interpolate.cpp | 390 +- .../trt_bicubic_interpolate.hpp | 89 +- .../trt_bicubic_interpolate_kernel.cu | 277 +- .../trt_bicubic_interpolate_kernel.hpp | 6 +- .../tensorrt/common/common_cuda_helper.hpp | 87 +- .../common/nms/batched_nms_kernel.hpp | 10 +- .../tensorrt/common/nms/cub_helper.h | 20 +- .../backend_ops/tensorrt/common/nms/kernel.h | 78 +- .../tensorrt/common/trt_plugin_base.hpp | 125 +- .../tensorrt/common/trt_plugin_helper.hpp | 270 +- .../tensorrt/common/trt_serialize.hpp | 160 +- .../tensorrt/common_impl/nms/allClassNMS.cu | 463 +- .../common_impl/nms/allClassRotatedNMS.cu | 915 +- .../common_impl/nms/batched_nms_kernel.cpp | 224 +- .../common_impl/nms/gatherNMSOutputs.cu | 279 +- .../tensorrt/common_impl/nms/kernel.cu | 141 +- .../tensorrt/common_impl/nms/permuteData.cu | 110 +- .../common_impl/nms/sortScoresPerClass.cu | 248 +- .../common_impl/nms/sortScoresPerImage.cu | 129 +- .../tensorrt/common_impl/trt_cuda_helper.cu | 150 +- .../tensorrt/deform_conv/trt_deform_conv.cpp | 526 +- .../tensorrt/deform_conv/trt_deform_conv.hpp | 131 +- .../deform_conv/trt_deform_conv_kernel.cu | 196 +- .../deform_conv/trt_deform_conv_kernel.cuh | 204 +- .../deform_conv/trt_deform_conv_kernel.hpp | 16 +- .../tensorrt/gather_topk/gather_topk.cpp | 309 +- .../tensorrt/gather_topk/gather_topk.hpp | 100 +- .../gather_topk/gather_topk_kernel.cu | 55 +- .../gather_topk/gather_topk_kernel.hpp | 6 +- .../tensorrt/grid_priors/trt_grid_priors.cpp | 327 +- .../tensorrt/grid_priors/trt_grid_priors.hpp | 82 +- .../grid_priors/trt_grid_priors_kernel.cu | 63 +- .../grid_priors/trt_grid_priors_kernel.hpp | 5 +- .../grid_sampler/trt_grid_sampler.cpp | 419 +- .../grid_sampler/trt_grid_sampler.hpp | 94 +- .../grid_sampler/trt_grid_sampler_kernel.cu | 718 +- .../grid_sampler/trt_grid_sampler_kernel.hpp | 19 +- .../instance_norm/trt_instance_norm.cpp | 436 +- .../instance_norm/trt_instance_norm.hpp | 100 +- .../trt_modulated_deform_conv.cpp | 673 +- .../trt_modulated_deform_conv.hpp | 133 +- .../trt_modulated_deform_conv_kernel.cu | 314 +- .../trt_modulated_deform_conv_kernel.hpp | 32 +- .../trt_multi_level_roi_align.cpp | 474 +- .../trt_multi_level_roi_align.hpp | 102 +- .../trt_multi_level_roi_align_kernel.cu | 373 +- 
.../trt_multi_level_roi_align_kernel.hpp | 8 +- .../trt_multi_level_rotated_roi_align.cpp | 494 +- .../trt_multi_level_rotated_roi_align.hpp | 103 +- ...rt_multi_level_rotated_roi_align_kernel.cu | 348 +- ...t_multi_level_rotated_roi_align_kernel.hpp | 8 +- .../trt_ms_deform_attn.cpp | 364 +- .../trt_ms_deform_attn.hpp | 89 +- .../trt_ms_deform_attn_kernel.cu | 121 +- .../trt_ms_deform_attn_kernel.cuh | 456 +- .../trt_ms_deform_attn_kernel.hpp | 9 +- .../tensorrt/roi_align/trt_roi_align.cpp | 515 +- .../tensorrt/roi_align/trt_roi_align.hpp | 97 +- .../roi_align/trt_roi_align_kernel.cu | 203 +- .../roi_align/trt_roi_align_kernel.hpp | 9 +- .../scaled_dot_product_attention.cpp | 389 +- .../scaled_dot_product_attention.hpp | 95 +- .../scaled_dot_product_attention_kernel.cu | 144 +- .../scaled_dot_product_attention_kernel.hpp | 10 +- .../tensorrt/scatternd/trt_scatternd.cpp | 331 +- .../tensorrt/scatternd/trt_scatternd.hpp | 100 +- .../scatternd/trt_scatternd_kernel.cu | 114 +- .../scatternd/trt_scatternd_kernel.hpp | 6 +- .../backend_ops/torchscript/ops/bind.cpp | 19 +- .../ops/coreml_nms/coreml_nms_cpu.cpp | 52 +- .../modulated_deform_conv_cpu.cpp | 167 +- .../modulated_deform_conv_cuda.cu | 149 +- .../torchscript/optimizer/bind.cpp | 61 +- .../optimizer/ir/subgraph_matcher.cpp | 643 +- .../optimizer/ir/subgraph_matcher.h | 64 +- .../torchscript/optimizer/optimizer.cpp | 78 +- .../torchscript/optimizer/optimizer.h | 9 +- .../onnx/common_subgraph_elimination.cpp | 282 +- .../passes/onnx/common_subgraph_elimination.h | 24 +- .../passes/onnx/flatten_cls_head.cpp | 190 +- .../optimizer/passes/onnx/flatten_cls_head.h | 12 +- .../passes/onnx/fuse_select_assign.cpp | 287 +- .../passes/onnx/fuse_select_assign.h | 18 +- .../passes/onnx/merge_shape_concate.cpp | 234 +- .../passes/onnx/merge_shape_concate.h | 12 +- .../optimizer/passes/onnx/onnx_peephole.cpp | 158 +- .../optimizer/passes/onnx/onnx_peephole.h | 12 +- .../torchscript/optimizer/passes/onnx/utils.h | 24 +- csrc/mmdeploy/codebase/common.h | 128 +- csrc/mmdeploy/codebase/mmaction/base_head.cpp | 109 +- .../codebase/mmaction/format_shape.cpp | 243 +- .../mmdeploy/codebase/mmaction/format_shape.h | 29 +- csrc/mmdeploy/codebase/mmaction/mmaction.cpp | 5 +- csrc/mmdeploy/codebase/mmaction/mmaction.h | 18 +- csrc/mmdeploy/codebase/mmcls/linear_cls.cpp | 218 +- csrc/mmdeploy/codebase/mmcls/mmcls.cpp | 5 +- csrc/mmdeploy/codebase/mmcls/mmcls.h | 18 +- .../codebase/mmcls/multi_label_linear_cls.cpp | 88 +- .../codebase/mmdet/base_dense_head.cpp | 190 +- .../mmdeploy/codebase/mmdet/base_dense_head.h | 27 +- .../codebase/mmdet/instance_segmentation.cpp | 395 +- csrc/mmdeploy/codebase/mmdet/mmdet.cpp | 5 +- csrc/mmdeploy/codebase/mmdet/mmdet.h | 30 +- .../codebase/mmdet/object_detection.cpp | 350 +- .../codebase/mmdet/object_detection.h | 33 +- csrc/mmdeploy/codebase/mmdet/rtmdet_head.cpp | 365 +- csrc/mmdeploy/codebase/mmdet/rtmdet_head.h | 40 +- csrc/mmdeploy/codebase/mmdet/utils.cpp | 152 +- csrc/mmdeploy/codebase/mmdet/utils.h | 33 +- csrc/mmdeploy/codebase/mmdet/yolo_head.cpp | 435 +- csrc/mmdeploy/codebase/mmdet/yolo_head.h | 74 +- csrc/mmdeploy/codebase/mmedit/mmedit.cpp | 5 +- csrc/mmdeploy/codebase/mmedit/mmedit.h | 7 +- csrc/mmdeploy/codebase/mmedit/restorer.cpp | 106 +- .../codebase/mmocr/attention_convertor.cpp | 127 +- .../codebase/mmocr/base_convertor.cpp | 298 +- csrc/mmdeploy/codebase/mmocr/base_convertor.h | 56 +- .../codebase/mmocr/contour_expand.cpp | 227 +- csrc/mmdeploy/codebase/mmocr/cpu/dbnet.cpp | 128 +- 
csrc/mmdeploy/codebase/mmocr/cpu/panet.cpp | 98 +- csrc/mmdeploy/codebase/mmocr/cpu/psenet.cpp | 102 +- csrc/mmdeploy/codebase/mmocr/crnn.cpp | 130 +- .../mmocr/cuda/connected_component.cu | 867 +- .../codebase/mmocr/cuda/connected_component.h | 29 +- csrc/mmdeploy/codebase/mmocr/cuda/dbnet.cpp | 135 +- csrc/mmdeploy/codebase/mmocr/cuda/panet.cpp | 185 +- csrc/mmdeploy/codebase/mmocr/cuda/psenet.cpp | 130 +- csrc/mmdeploy/codebase/mmocr/cuda/utils.cu | 211 +- csrc/mmdeploy/codebase/mmocr/cuda/utils.h | 34 +- csrc/mmdeploy/codebase/mmocr/dbnet.cpp | 263 +- csrc/mmdeploy/codebase/mmocr/dbnet.h | 27 +- csrc/mmdeploy/codebase/mmocr/mmocr.cpp | 5 +- csrc/mmdeploy/codebase/mmocr/mmocr.h | 37 +- csrc/mmdeploy/codebase/mmocr/panet.cpp | 231 +- csrc/mmdeploy/codebase/mmocr/panet.h | 55 +- csrc/mmdeploy/codebase/mmocr/pixel_group.cpp | 219 +- csrc/mmdeploy/codebase/mmocr/psenet.cpp | 225 +- csrc/mmdeploy/codebase/mmocr/psenet.h | 45 +- .../codebase/mmocr/rescale_to_height.cpp | 123 +- csrc/mmdeploy/codebase/mmocr/resize_ocr.cpp | 161 +- .../mmocr/short_scale_aspect_jitter.cpp | 167 +- csrc/mmdeploy/codebase/mmocr/warp.cpp | 105 +- .../mmpose/keypoints_from_heatmap.cpp | 716 +- .../mmpose/keypoints_from_regression.cpp | 207 +- csrc/mmdeploy/codebase/mmpose/mmpose.cpp | 5 +- csrc/mmdeploy/codebase/mmpose/mmpose.h | 29 +- .../codebase/mmpose/pose_tracker/common.h | 80 +- .../codebase/mmpose/pose_tracker/pipeline.cpp | 207 +- .../mmpose/pose_tracker/pose_tracker.cpp | 792 +- .../mmpose/pose_tracker/pose_tracker.h | 133 +- .../mmpose/pose_tracker/smoothing_filter.cpp | 93 +- .../mmpose/pose_tracker/smoothing_filter.h | 98 +- .../codebase/mmpose/pose_tracker/track.cpp | 124 +- .../codebase/mmpose/pose_tracker/track.h | 125 +- .../mmpose/pose_tracker/tracking_filter.cpp | 410 +- .../mmpose/pose_tracker/tracking_filter.h | 58 +- .../codebase/mmpose/pose_tracker/utils.cpp | 257 +- .../codebase/mmpose/pose_tracker/utils.h | 154 +- csrc/mmdeploy/codebase/mmpose/simcc_label.cpp | 225 +- .../codebase/mmpose/topdown_affine.cpp | 280 +- .../mmpose/topdown_get_bbox_center_scale.cpp | 93 +- csrc/mmdeploy/codebase/mmrotate/mmrotate.cpp | 5 +- csrc/mmdeploy/codebase/mmrotate/mmrotate.h | 31 +- .../mmrotate/oriented_object_detection.cpp | 206 +- csrc/mmdeploy/codebase/mmseg/mmseg.cpp | 5 +- csrc/mmdeploy/codebase/mmseg/mmseg.h | 22 +- csrc/mmdeploy/codebase/mmseg/segment.cpp | 256 +- csrc/mmdeploy/core/archive.h | 220 +- csrc/mmdeploy/core/device.h | 724 +- csrc/mmdeploy/core/device_impl.cpp | 755 +- csrc/mmdeploy/core/device_impl.h | 295 +- csrc/mmdeploy/core/graph.cpp | 264 +- csrc/mmdeploy/core/graph.h | 129 +- csrc/mmdeploy/core/logger.cpp | 118 +- csrc/mmdeploy/core/logger.h | 75 +- csrc/mmdeploy/core/macro.h | 44 +- csrc/mmdeploy/core/mat.cpp | 139 +- csrc/mmdeploy/core/mat.h | 189 +- csrc/mmdeploy/core/model.cpp | 130 +- csrc/mmdeploy/core/model.h | 191 +- csrc/mmdeploy/core/model_impl.h | 80 +- csrc/mmdeploy/core/module.cpp | 5 +- csrc/mmdeploy/core/module.h | 16 +- csrc/mmdeploy/core/mpl/detected.h | 90 +- csrc/mmdeploy/core/mpl/iterator.h | 11 +- csrc/mmdeploy/core/mpl/priority_tag.h | 15 +- csrc/mmdeploy/core/mpl/span.h | 295 +- csrc/mmdeploy/core/mpl/static_any.h | 948 +- csrc/mmdeploy/core/mpl/structure.h | 502 +- csrc/mmdeploy/core/mpl/type_traits.h | 69 +- csrc/mmdeploy/core/net.cpp | 5 +- csrc/mmdeploy/core/net.h | 28 +- csrc/mmdeploy/core/operator.cpp | 427 +- csrc/mmdeploy/core/operator.h | 213 +- csrc/mmdeploy/core/profiler.cpp | 175 +- csrc/mmdeploy/core/profiler.h | 143 +- 
csrc/mmdeploy/core/registry.cpp | 150 +- csrc/mmdeploy/core/registry.h | 465 +- csrc/mmdeploy/core/serialization.h | 615 +- csrc/mmdeploy/core/status_code.cpp | 76 +- csrc/mmdeploy/core/status_code.h | 258 +- csrc/mmdeploy/core/tensor.cpp | 445 +- csrc/mmdeploy/core/tensor.h | 164 +- csrc/mmdeploy/core/types.h | 85 +- csrc/mmdeploy/core/utils/device_utils.cpp | 81 +- csrc/mmdeploy/core/utils/device_utils.h | 37 +- csrc/mmdeploy/core/utils/filesystem.h | 4 +- csrc/mmdeploy/core/utils/formatter.cpp | 8 +- csrc/mmdeploy/core/utils/formatter.h | 196 +- csrc/mmdeploy/core/utils/source_location.h | 43 +- csrc/mmdeploy/core/utils/stacktrace.cpp | 108 +- csrc/mmdeploy/core/utils/stacktrace.h | 32 +- csrc/mmdeploy/core/value.h | 2553 +++-- csrc/mmdeploy/device/acl/acl_device.cpp | 17 +- csrc/mmdeploy/device/cpu/cpu_device.cpp | 926 +- csrc/mmdeploy/device/cpu/cpu_device.h | 187 +- csrc/mmdeploy/device/cuda/buddy_allocator.h | 355 +- csrc/mmdeploy/device/cuda/cuda_device.cpp | 1119 ++- csrc/mmdeploy/device/cuda/cuda_device.h | 305 +- csrc/mmdeploy/device/cuda/default_allocator.h | 106 +- csrc/mmdeploy/device/cuda/linear_allocator.h | 123 +- csrc/mmdeploy/device/device_allocator.h | 735 +- csrc/mmdeploy/execution/bulk.h | 236 +- csrc/mmdeploy/execution/closure.h | 153 +- csrc/mmdeploy/execution/concepts.h | 276 +- csrc/mmdeploy/execution/dynamic_batch.h | 97 +- csrc/mmdeploy/execution/ensure_started.h | 342 +- csrc/mmdeploy/execution/execute.h | 45 +- csrc/mmdeploy/execution/expand.h | 112 +- csrc/mmdeploy/execution/just.h | 124 +- csrc/mmdeploy/execution/let_value.h | 295 +- csrc/mmdeploy/execution/on.h | 239 +- csrc/mmdeploy/execution/run_loop.h | 339 +- csrc/mmdeploy/execution/schedule_from.h | 270 +- .../schedulers/dynamic_batch_scheduler.h | 556 +- .../execution/schedulers/inlined_scheduler.h | 142 +- .../execution/schedulers/intrusive_queue.h | 186 +- csrc/mmdeploy/execution/schedulers/registry.h | 9 +- .../execution/schedulers/schedulers.cpp | 201 +- .../schedulers/single_thread_context.h | 104 +- .../execution/schedulers/static_thread_pool.h | 731 +- .../schedulers/timed_single_thread_context.h | 417 +- csrc/mmdeploy/execution/split.h | 351 +- csrc/mmdeploy/execution/start_detached.h | 66 +- csrc/mmdeploy/execution/submit.h | 103 +- csrc/mmdeploy/execution/sync_wait.h | 148 +- csrc/mmdeploy/execution/tag_invoke.h | 152 +- csrc/mmdeploy/execution/then.h | 197 +- csrc/mmdeploy/execution/transfer.h | 76 +- csrc/mmdeploy/execution/transfer_just.h | 45 +- csrc/mmdeploy/execution/type_erased.h | 1007 +- csrc/mmdeploy/execution/type_traits.h | 133 +- csrc/mmdeploy/execution/utility.h | 129 +- csrc/mmdeploy/execution/when_all.h | 358 +- csrc/mmdeploy/execution/when_all_value.h | 176 +- csrc/mmdeploy/experimental/module_adapter.h | 229 +- csrc/mmdeploy/graph/common.h | 103 +- csrc/mmdeploy/graph/cond.cpp | 240 +- csrc/mmdeploy/graph/cond.h | 45 +- csrc/mmdeploy/graph/flattened.h | 74 +- csrc/mmdeploy/graph/inference.cpp | 145 +- csrc/mmdeploy/graph/inference.h | 22 +- csrc/mmdeploy/graph/pipeline.cpp | 22 +- csrc/mmdeploy/graph/pipeline.h | 16 +- csrc/mmdeploy/graph/static_router.cpp | 434 +- csrc/mmdeploy/graph/static_router.h | 82 +- csrc/mmdeploy/graph/task.cpp | 131 +- csrc/mmdeploy/graph/task.h | 51 +- csrc/mmdeploy/model/directory_model_impl.cpp | 112 +- csrc/mmdeploy/model/zip_model_impl.cpp | 258 +- csrc/mmdeploy/net/acl/acl_net.cpp | 1334 +-- csrc/mmdeploy/net/acl/acl_net.h | 103 +- csrc/mmdeploy/net/coreml/coreml_net.h | 55 +- csrc/mmdeploy/net/ncnn/ncnn_net.cpp | 309 +- 
csrc/mmdeploy/net/ncnn/ncnn_net.h | 49 +- csrc/mmdeploy/net/net_module.cpp | 623 +- csrc/mmdeploy/net/net_module.h | 30 +- csrc/mmdeploy/net/openvino/openvino_net.cpp | 522 +- csrc/mmdeploy/net/openvino/openvino_net.h | 46 +- csrc/mmdeploy/net/ort/ort_net.cpp | 405 +- csrc/mmdeploy/net/ort/ort_net.h | 40 +- csrc/mmdeploy/net/ppl/ppl_net.cpp | 714 +- csrc/mmdeploy/net/ppl/ppl_net.h | 62 +- csrc/mmdeploy/net/rknn/rknn_net.cpp | 570 +- csrc/mmdeploy/net/rknn/rknn_net.h | 46 +- csrc/mmdeploy/net/snpe/snpe_net.cpp | 511 +- csrc/mmdeploy/net/snpe/snpe_net.h | 75 +- csrc/mmdeploy/net/torchscript/torch_net.cpp | 444 +- csrc/mmdeploy/net/torchscript/torch_net.h | 48 +- csrc/mmdeploy/net/trt/trt_net.cpp | 462 +- csrc/mmdeploy/net/trt/trt_net.h | 138 +- csrc/mmdeploy/net/tvm/tvm_net.cpp | 575 +- csrc/mmdeploy/net/tvm/tvm_net.h | 56 +- csrc/mmdeploy/operation/cpu/crop.cpp | 27 +- .../operation/cpu/crop_resize_pad.cpp | 31 +- csrc/mmdeploy/operation/cpu/cvtcolor.cpp | 26 +- csrc/mmdeploy/operation/cpu/flip.cpp | 31 +- csrc/mmdeploy/operation/cpu/hwc2chw.cpp | 46 +- csrc/mmdeploy/operation/cpu/normalize.cpp | 49 +- csrc/mmdeploy/operation/cpu/pad.cpp | 65 +- csrc/mmdeploy/operation/cpu/permute.cpp | 167 +- csrc/mmdeploy/operation/cpu/resize.cpp | 37 +- csrc/mmdeploy/operation/cpu/to_float.cpp | 73 +- csrc/mmdeploy/operation/cpu/warp_affine.cpp | 40 +- csrc/mmdeploy/operation/cuda/cast.cu | 44 +- csrc/mmdeploy/operation/cuda/crop.cpp | 118 +- csrc/mmdeploy/operation/cuda/crop.cu | 97 +- .../operation/cuda/crop_resize_pad.cpp | 175 +- csrc/mmdeploy/operation/cuda/cvtcolor.cpp | 249 +- csrc/mmdeploy/operation/cuda/flip.cpp | 119 +- csrc/mmdeploy/operation/cuda/hwc2chw.cpp | 79 +- csrc/mmdeploy/operation/cuda/normalize.cpp | 130 +- csrc/mmdeploy/operation/cuda/normalize.cu | 103 +- csrc/mmdeploy/operation/cuda/pad.cpp | 166 +- csrc/mmdeploy/operation/cuda/permute.cpp | 156 +- csrc/mmdeploy/operation/cuda/permute.cu | 86 +- csrc/mmdeploy/operation/cuda/permute.h | 27 +- csrc/mmdeploy/operation/cuda/resize.cpp | 187 +- csrc/mmdeploy/operation/cuda/to_float.cpp | 65 +- csrc/mmdeploy/operation/cuda/transpose.cu | 76 +- csrc/mmdeploy/operation/cuda/warp_affine.cpp | 237 +- csrc/mmdeploy/operation/dummy/operations.cpp | 177 +- csrc/mmdeploy/operation/managed.h | 399 +- csrc/mmdeploy/operation/operation.cpp | 66 +- csrc/mmdeploy/operation/operation.h | 237 +- csrc/mmdeploy/operation/vision.cpp | 25 +- csrc/mmdeploy/operation/vision.h | 189 +- .../preprocess/elena/elena_registry.cpp | 49 +- .../preprocess/elena/elena_registry.h | 60 +- csrc/mmdeploy/preprocess/elena/fused.cpp | 266 +- .../preprocess/transform/center_crop.cpp | 174 +- .../mmdeploy/preprocess/transform/collect.cpp | 125 +- .../mmdeploy/preprocess/transform/compose.cpp | 175 +- .../transform/default_format_bundle.cpp | 122 +- .../preprocess/transform/image2tensor.cpp | 85 +- .../preprocess/transform/letter_resize.cpp | 292 +- csrc/mmdeploy/preprocess/transform/lift.cpp | 53 +- csrc/mmdeploy/preprocess/transform/load.cpp | 192 +- .../preprocess/transform/normalize.cpp | 261 +- csrc/mmdeploy/preprocess/transform/pad.cpp | 307 +- csrc/mmdeploy/preprocess/transform/resize.cpp | 278 +- .../preprocess/transform/ten_crop.cpp | 165 +- .../preprocess/transform/three_crop.cpp | 180 +- csrc/mmdeploy/preprocess/transform/tracer.cpp | 127 +- csrc/mmdeploy/preprocess/transform/tracer.h | 173 +- .../preprocess/transform/transform.cpp | 39 +- .../mmdeploy/preprocess/transform/transform.h | 33 +- csrc/mmdeploy/preprocess/transform_module.cpp | 94 +- 
csrc/mmdeploy/utils/dlpack/dlpack_utils.cpp | 366 +- csrc/mmdeploy/utils/dlpack/dlpack_utils.h | 8 +- csrc/mmdeploy/utils/opencv/opencv_utils.cpp | 685 +- csrc/mmdeploy/utils/opencv/opencv_utils.h | 311 +- demo/csrc/c/batch_image_classification.cpp | 173 +- demo/csrc/c/batch_object_detection.cpp | 275 +- demo/csrc/c/det_cls.cpp | 121 +- demo/csrc/c/det_pose.cpp | 232 +- demo/csrc/c/image_classification.cpp | 93 +- demo/csrc/c/image_restorer.cpp | 90 +- demo/csrc/c/image_segmentation.cpp | 141 +- demo/csrc/c/object_detection.cpp | 158 +- demo/csrc/c/ocr.cpp | 136 +- demo/csrc/c/pose_detection.cpp | 93 +- demo/csrc/c/rotated_object_detection.cpp | 119 +- demo/csrc/c/video_recognition.cpp | 207 +- demo/csrc/cpp/classifier.cxx | 86 +- demo/csrc/cpp/det_pose.cxx | 95 +- demo/csrc/cpp/detector.cxx | 85 +- demo/csrc/cpp/pose_detector.cxx | 77 +- demo/csrc/cpp/pose_tracker.cxx | 88 +- demo/csrc/cpp/pose_tracker_params.h | 44 +- demo/csrc/cpp/restorer.cxx | 77 +- demo/csrc/cpp/rotated_detector.cxx | 85 +- demo/csrc/cpp/segmentor.cxx | 84 +- demo/csrc/cpp/text_det_recog.cxx | 57 +- demo/csrc/cpp/text_ocr.cxx | 81 +- demo/csrc/cpp/utils/argparse.h | 528 +- demo/csrc/cpp/utils/mediaio.h | 920 +- demo/csrc/cpp/utils/palette.h | 245 +- demo/csrc/cpp/utils/skeleton.h | 413 +- demo/csrc/cpp/utils/visualize.h | 498 +- demo/csrc/cpp/video_cls.cxx | 124 +- tests/test_csrc/archive/test_json_archive.cpp | 85 +- .../test_csrc/archive/test_value_archive.cpp | 185 +- tests/test_csrc/capi/test_classifier.cpp | 94 +- tests/test_csrc/capi/test_detector.cpp | 103 +- tests/test_csrc/capi/test_model.cpp | 40 +- tests/test_csrc/capi/test_restorer.cpp | 90 +- tests/test_csrc/capi/test_segmentor.cpp | 92 +- tests/test_csrc/capi/test_text_detector.cpp | 98 +- tests/test_csrc/capi/test_text_recognizer.cpp | 210 +- tests/test_csrc/core/test_execution.cpp | 800 +- tests/test_csrc/core/test_mat.cpp | 169 +- tests/test_csrc/core/test_module_adapter.cpp | 49 +- tests/test_csrc/core/test_registry.cpp | 165 +- tests/test_csrc/core/test_span.cpp | 155 +- tests/test_csrc/core/test_status_code.cpp | 66 +- tests/test_csrc/core/test_value.cpp | 581 +- tests/test_csrc/device/test_cpu_device.cpp | 52 +- tests/test_csrc/device/test_cuda_device.cpp | 56 +- tests/test_csrc/device/test_opencl_device.cpp | 56 +- tests/test_csrc/graph/test_cond.cpp | 86 +- .../test_csrc/model/test_directory_model.cpp | 47 +- tests/test_csrc/model/test_model.cpp | 74 +- tests/test_csrc/model/test_zip_model.cpp | 70 +- tests/test_csrc/net/test_ncnn_net.cpp | 31 +- tests/test_csrc/net/test_openvino_net.cpp | 31 +- tests/test_csrc/net/test_ort_net.cpp | 31 +- tests/test_csrc/net/test_ppl_net.cpp | 25 +- tests/test_csrc/net/test_trt_net.cpp | 31 +- tests/test_csrc/preprocess/test_collect.cpp | 172 +- tests/test_csrc/preprocess/test_compose.cpp | 62 +- tests/test_csrc/preprocess/test_crop.cpp | 180 +- .../preprocess/test_default_format_bundle.cpp | 90 +- .../preprocess/test_image2tensor.cpp | 96 +- tests/test_csrc/preprocess/test_load.cpp | 114 +- tests/test_csrc/preprocess/test_normalize.cpp | 151 +- tests/test_csrc/preprocess/test_pad.cpp | 184 +- tests/test_csrc/preprocess/test_permute.cpp | 181 +- tests/test_csrc/preprocess/test_resize.cpp | 527 +- tests/test_csrc/preprocess/test_utils.cpp | 121 +- tests/test_csrc/preprocess/test_utils.h | 28 +- tests/test_csrc/test_resource.h | 265 +- third_party/clipper/clipper.cpp | 8871 +++++++++-------- third_party/clipper/clipper.hpp | 823 +- third_party/concurrentqueue/concurrentqueue.h | 7634 +++++++------- 
third_party/dlpack/dlpack.h | 374 +- 559 files changed, 72407 insertions(+), 59911 deletions(-) mode change 100755 => 100644 csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.cpp mode change 100755 => 100644 csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.h mode change 100755 => 100644 csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.cpp mode change 100755 => 100644 csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.h mode change 100755 => 100644 csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.h mode change 100755 => 100644 csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_definer.h mode change 100755 => 100644 csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.cpp mode change 100755 => 100644 csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.h mode change 100755 => 100644 csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.cpp mode change 100755 => 100644 csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.h mode change 100755 => 100644 csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.h mode change 100755 => 100644 csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/ncnn_ext.cpp diff --git a/.clang-format b/.clang-format index c7370bb66a..018938c588 100644 --- a/.clang-format +++ b/.clang-format @@ -1,156 +1,255 @@ ---- -Language: Cpp -# BasedOnStyle: Google -AccessModifierOffset: -1 -AlignAfterOpenBracket: Align -AlignConsecutiveMacros: false -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignEscapedNewlines: Left -AlignOperands: true -AlignTrailingComments: true -AllowAllArgumentsOnNextLine: true -AllowAllConstructorInitializersOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: All -AllowShortLambdasOnASingleLine: All -AllowShortIfStatementsOnASingleLine: WithoutElse -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false +# reference from https://clang.llvm.org/docs/ClangFormatStyleOptions.html + +# Disable formatting +DisableFormat: false + +# Base formatting style +BasedOnStyle: LLVM + +# Language: None, Cpp, Java, JavaScript, ObjC, Proto, TableGen, TextProto +Language: Cpp + +# Standard: Cpp03, Cpp11, Auto +Standard: Cpp11 + +# Tab width +TabWidth: 4 + +# Use tab characters: Never, ForIndentation, ForContinuationAndIndentation, Always +UseTab: Never + +# Offset of access specifiers (public, private, etc.) +AccessModifierOffset: -2 + +# Indent width +IndentWidth: 4 + +# Indent width of constructor initializer lists +ConstructorInitializerIndentWidth: 4 + +# Minimum indent width of continuation lines +ContinuationIndentWidth: 4 + +# Indent case labels +IndentCaseLabels: true + +# When a function's return type wraps, indent the function name in declarations and definitions +IndentWrappedFunctionNames: true + +# Namespace indentation: None, Inner (indent the contents of nested namespaces), All +NamespaceIndentation: All + +# Preprocessor directive indentation: None, AfterHash, BeforeHash +IndentPPDirectives: BeforeHash + +# Alignment after open brackets (round, angle and square): Align, DontAlign, AlwaysBreak (always break after the open bracket) +AlignAfterOpenBracket: Align + +# Align the equals signs of consecutive assignments +#AlignConsecutiveAssignments: AcrossEmptyLinesAndComments +AlignConsecutiveAssignments: AcrossComments + +# Align the variable names of consecutive declarations +AlignConsecutiveDeclarations:
AcrossEmptyLinesAndComments +#AlignConsecutiveDeclarations: AcrossComments + +#AlignEscapedNewlines: Right + +# Left-align the backslashes of escaped newlines (backslash line continuations) +#AlignEscapedNewlinesLeft: true + +# Horizontally align the operands of binary and ternary expressions +AlignOperands: true + +# Align consecutive trailing comments +AlignTrailingComments: true + +# Pointer and reference alignment: Left, Right, Middle +PointerAlignment: Left + +# Derive the most commonly used pointer and reference alignment +DerivePointerAlignment: false + +# Allow all parameters of a function declaration to be put onto the next line +AllowAllParametersOfDeclarationOnNextLine: false + +# false means call arguments are either all on the same line or each on their own line +BinPackArguments: false + +# false means all parameters are either on the same line or each on their own line +BinPackParameters: false + +# Allow all arguments of a call to be put onto the next line, even if BinPackParameters is false +AllowAllArgumentsOnNextLine: false + +# Allow short blocks on a single line +AllowShortBlocksOnASingleLine: true + +# Allow short case labels on a single line +AllowShortCaseLabelsOnASingleLine: true + +# Allow short functions on a single line: None, InlineOnly (defined in a class), Empty (empty functions), Inline (defined in a class, or empty), All +AllowShortFunctionsOnASingleLine: Empty + +# Allow short if statements to stay on a single line +AllowShortIfStatementsOnASingleLine: true + +# Allow short loops to stay on a single line +AllowShortLoopsOnASingleLine: true + +# Always break after the return type of definitions (deprecated) +AlwaysBreakAfterDefinitionReturnType: None + +# Always break after the return type: None, All, TopLevel (top-level functions, excluding member functions), +# AllDefinitions (all definitions, excluding declarations), TopLevelDefinitions (definitions of all top-level functions) +AlwaysBreakAfterReturnType: None + +# Always break before multiline string literals +AlwaysBreakBeforeMultilineStrings: false + +# Always break after template declarations +AlwaysBreakTemplateDeclarations: true + +# Constructor initializer lists are either all on the same line or each on their own line +ConstructorInitializerAllOnOneLineOrOnePerLine: false + +# Break constructor initializer lists before the commas and colons, aligning the initializers +BreakConstructorInitializers: BeforeComma + +# Automatically detect whether calls and definitions are formatted with one argument per line (experimental) +ExperimentalAutoDetectBinPacking: true + +# Remove spaces after the { and before the } of C++11 braced list initialization +Cpp11BracedListStyle: true + +# Brace wrapping; only takes effect when BreakBeforeBraces is set to Custom +BraceWrapping: + # After class definitions + AfterClass: true + # After control statements + AfterControlStatement: true + # After enum definitions + AfterEnum: true + # After function definitions + AfterFunction: true + # After namespace definitions + AfterNamespace: true + # After ObjC declarations + AfterObjCDeclaration: true + # After struct definitions + AfterStruct: true + # After union definitions + AfterUnion: true + AfterExternBlock: true + # Before catch + BeforeCatch: true + # Before else + BeforeElse: true + # Indent braces + IndentBraces: false SplitEmptyFunction: true SplitEmptyRecord: true SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 100 -CommentPragmas: '^ IWYU pragma:' + +# Break before binary operators: None (break after operators), NonAssignment (break before non-assignment operators), All (break before operators) +BreakBeforeBinaryOperators: None + +# Break before braces: Attach (always attach braces to the surrounding context), Linux (like Attach, except for functions, namespaces and class definitions), +# Mozilla (like Attach, except for enums, functions and record definitions), Stroustrup (like Attach, except for function definitions, catch and else), +# Allman (always break before braces), GNU (always break before braces, with extra indentation for braces of control statements), WebKit (break before functions), Custom +# Note: statement blocks are also treated as functions here +BreakBeforeBraces: Allman + +# Break before ternary operators +BreakBeforeTernaryOperators: false + +# Break string literals +BreakStringLiterals: false + +# Character limit per line; 0 means no limit +ColumnLimit: 0 + +# Penalty for breaking around an assignment +PenaltyBreakAssignment: 100 + +# Penalty for breaking a function call after call( +PenaltyBreakBeforeFirstCallParameter: 100 + +# Penalty for introducing a line break inside a comment +PenaltyBreakComment: 100 + +# Penalty for breaking before the first << +PenaltyBreakFirstLessLess: 100 + +# Penalty for introducing a line break inside a string literal +PenaltyBreakString: 100 + +# Penalty for each character outside the line length limit +PenaltyExcessCharacter: 100 + +# Penalty for putting a function's return type on its own line +PenaltyReturnTypeOnItsOwnLine: 100 + +# Add a space after C-style casts +SpaceAfterCStyleCast: false + +# Add a space after the template keyword
+SpaceAfterTemplateKeyword: false + +# Add a space before assignment operators +SpaceBeforeAssignmentOperators: true + +# Add a space before opening parentheses: Never, ControlStatements, Always +SpaceBeforeParens: ControlStatements + +# Number of spaces added before trailing comments (only applies to //) +SpacesBeforeTrailingComments: 2 + +# Add spaces after the < and before the > of angle brackets +SpacesInAngles: false + +# Add spaces inside container literals (ObjC and JavaScript arrays, dictionaries, etc.) +SpacesInContainerLiterals: false + +# Add spaces inside the parentheses of C-style casts +SpacesInCStyleCastParentheses: false + +# Add spaces after the ( and before the ) of parentheses +SpacesInParentheses: false + +# Add a space inside empty parentheses +SpaceInEmptyParentheses: false + +# Add spaces after the [ and before the ]; lambda expressions and unsized array declarations are not affected +SpacesInSquareBrackets: false + +# Penalty for each character of indented whitespace +PenaltyIndentedWhitespace: 10 + +# Regular expression describing comments with special meaning that should not be split into lines or otherwise changed +CommentPragmas: '^ IWYU pragma:' + +# Compact consecutive namespace declarations CompactNamespaces: false -ConstructorInitializerAllOnOneLineOrOnePerLine: true -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: true -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IncludeBlocks: Regroup -IncludeCategories: - - Regex: '^<ext/.*\.h>' - Priority: 2 - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '.*' - Priority: 3 -IncludeIsMainRegex: '([-_](test|unittest))?$' -IndentCaseLabels: true -IndentPPDirectives: None -IndentWidth: 2 -IndentWrappedFunctionNames: false -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Never -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Left -RawStringFormats: - - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' - CanonicalDelimiter: '' - BasedOnStyle: google - - Language: TextProto - Delimiters: - - pb - - PB - - proto - - PROTO - EnclosingFunctions: - - EqualsProto - - EquivToProto - - PARSE_PARTIAL_TEXT_PROTO - - PARSE_TEST_PROTO - - PARSE_TEXT_PROTO - - ParseTextOrDie - - ParseTextProtoOrDie - CanonicalDelimiter: '' - BasedOnStyle: google -ReflowComments: true -SortIncludes: true -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: Auto -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 8 -UseTab: Never -...
+ +# Keep empty lines at the start of blocks +KeepEmptyLinesAtTheStartOfBlocks: false + +# Maximum number of consecutive empty lines +MaxEmptyLinesToKeep: 2 + +# Allow re-flowing comments +ReflowComments: true + +# Allow sorting of #includes +SortIncludes: false + +# Sort #includes: an #include that matches some regex gets the corresponding priority, and unmatched ones default to INT_MAX (lower priorities sort first); +# negative priorities can be defined to keep certain #includes always at the very front +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + - Regex: '^(<|"(gtest|isl|json)/)' + Priority: 3 + - Regex: '.*' + Priority: 1 \ No newline at end of file diff --git a/csrc/mmdeploy/apis/c/mmdeploy/classifier.cpp b/csrc/mmdeploy/apis/c/mmdeploy/classifier.cpp index 3eec4ef90b..9faf47f349 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/classifier.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/classifier.cpp @@ -16,118 +16,132 @@ using namespace mmdeploy; using namespace std; -int mmdeploy_classifier_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_classifier_t* classifier) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_classifier_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_classifier_t* classifier) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_classifier_create_v2(model, context, classifier); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_classifier_create_v2(model, context, classifier); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_classifier_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_classifier_t* classifier) { - mmdeploy_model_t model{}; +int mmdeploy_classifier_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_classifier_t* classifier) +{ + mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { + if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = mmdeploy_classifier_create(model, device_name, device_id, classifier); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_classifier_create(model, device_name, device_id, classifier); - mmdeploy_model_destroy(model); - return ec; } -int mmdeploy_classifier_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_classifier_t* classifier) { - return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)classifier); +int mmdeploy_classifier_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_classifier_t* classifier) +{ + return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)classifier); } -int mmdeploy_classifier_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* value) { - return mmdeploy_common_create_input(mats, mat_count, value); +int mmdeploy_classifier_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value) +{ + return mmdeploy_common_create_input(mats, mat_count, value); } -int mmdeploy_classifier_apply(mmdeploy_classifier_t classifier, const mmdeploy_mat_t* mats, - int mat_count, mmdeploy_classification_t** results, - int** result_count) { - wrapped<mmdeploy_value_t> input; - if (auto ec = mmdeploy_classifier_create_input(mats, mat_count, input.ptr())) { - return ec; - } - wrapped<mmdeploy_value_t> output; - if (auto ec = mmdeploy_classifier_apply_v2(classifier, input, output.ptr())) { - return ec; - }
if (auto ec = mmdeploy_classifier_get_result(output, results, result_count)) { - return ec; - } - return MMDEPLOY_SUCCESS; +int mmdeploy_classifier_apply(mmdeploy_classifier_t classifier, const mmdeploy_mat_t* mats, int mat_count, mmdeploy_classification_t** results, int** result_count) +{ + wrapped<mmdeploy_value_t> input; + if (auto ec = mmdeploy_classifier_create_input(mats, mat_count, input.ptr())) + { + return ec; + } + wrapped<mmdeploy_value_t> output; + if (auto ec = mmdeploy_classifier_apply_v2(classifier, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_classifier_get_result(output, results, result_count)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } -int mmdeploy_classifier_apply_v2(mmdeploy_classifier_t classifier, mmdeploy_value_t input, - mmdeploy_value_t* output) { - return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)classifier, input, output); +int mmdeploy_classifier_apply_v2(mmdeploy_classifier_t classifier, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)classifier, input, output); } -int mmdeploy_classifier_apply_async(mmdeploy_classifier_t classifier, mmdeploy_sender_t input, - mmdeploy_sender_t* output) { - return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)classifier, input, output); +int mmdeploy_classifier_apply_async(mmdeploy_classifier_t classifier, mmdeploy_sender_t input, mmdeploy_sender_t* output) +{ + return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)classifier, input, output); } -int mmdeploy_classifier_get_result(mmdeploy_value_t output, mmdeploy_classification_t** results, - int** result_count) { - if (!output || !results || !result_count) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - Value& value = Cast(output)->front(); - - auto classify_outputs = from_value<vector<mmcls::Labels>>(value); - - vector<int> _result_count; - _result_count.reserve(classify_outputs.size()); - - for (const auto& cls_output : classify_outputs) { - _result_count.push_back((int)cls_output.size()); +int mmdeploy_classifier_get_result(mmdeploy_value_t output, mmdeploy_classification_t** results, int** result_count) +{ + if (!output || !results || !result_count) + { + return MMDEPLOY_E_INVALID_ARG; } - - auto total = std::accumulate(begin(_result_count), end(_result_count), 0); - - std::unique_ptr<int[]> result_count_data(new int[_result_count.size()]{}); - std::copy(_result_count.begin(), _result_count.end(), result_count_data.get()); - - std::unique_ptr<mmdeploy_classification_t[]> result_data( - new mmdeploy_classification_t[total]{}); - auto result_ptr = result_data.get(); - for (const auto& cls_output : classify_outputs) { - for (const auto& label : cls_output) { - result_ptr->label_id = label.label_id; - result_ptr->score = label.score; - ++result_ptr; - } + try + { + Value& value = Cast(output)->front(); + + auto classify_outputs = from_value<vector<mmcls::Labels>>(value); + + vector<int> _result_count; + _result_count.reserve(classify_outputs.size()); + + for (const auto& cls_output : classify_outputs) + { + _result_count.push_back((int)cls_output.size()); + } + + auto total = std::accumulate(begin(_result_count), end(_result_count), 0); + + std::unique_ptr<int[]> result_count_data(new int[_result_count.size()]{}); + std::copy(_result_count.begin(), _result_count.end(), result_count_data.get()); + + std::unique_ptr<mmdeploy_classification_t[]> result_data( + new mmdeploy_classification_t[total]{}); + auto result_ptr = result_data.get(); + for (const auto& cls_output : classify_outputs) + { + for (const auto& label : cls_output) + { + result_ptr->label_id = label.label_id; + result_ptr->score = label.score; + ++result_ptr; + } + } + + 
*result_count = result_count_data.release(); + *results = result_data.release(); + + return MMDEPLOY_SUCCESS; } - - *result_count = result_count_data.release(); - *results = result_data.release(); - - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -void mmdeploy_classifier_release_result(mmdeploy_classification_t* results, const int* result_count, - int count) { - delete[] results; - delete[] result_count; +void mmdeploy_classifier_release_result(mmdeploy_classification_t* results, const int* result_count, int count) +{ + delete[] results; + delete[] result_count; } -void mmdeploy_classifier_destroy(mmdeploy_classifier_t classifier) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)classifier); +void mmdeploy_classifier_destroy(mmdeploy_classifier_t classifier) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)classifier); } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/classifier.h b/csrc/mmdeploy/apis/c/mmdeploy/classifier.h index 54e9d0215b..1681cf7fae 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/classifier.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/classifier.h @@ -13,124 +13,125 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_classification_t { - int label_id; - float score; -} mmdeploy_classification_t; - -typedef struct mmdeploy_classifier* mmdeploy_classifier_t; - -/** - * @brief Create classifier's handle - * @param[in] model an instance of mmclassification sdk model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] classifier instance of a classifier, which must be destroyed - * by \ref mmdeploy_classifier_destroy - * @return status of creating classifier's handle - */ -MMDEPLOY_API int mmdeploy_classifier_create(mmdeploy_model_t model, const char* device_name, - int device_id, mmdeploy_classifier_t* classifier); - -/** - * @brief Create classifier's handle - * @param[in] model_path path of mmclassification sdk model exported by mmdeploy model converter - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] classifier instance of a classifier, which must be destroyed - * by \ref mmdeploy_classifier_destroy - * @return status of creating classifier's handle - */ -MMDEPLOY_API int mmdeploy_classifier_create_by_path(const char* model_path, const char* device_name, - int device_id, - mmdeploy_classifier_t* classifier); - -/** - * @brief Use classifier created by \ref mmdeploy_classifier_create_by_path to get label - * information of each image in a batch - * @param[in] classifier classifier's handle created by \ref mmdeploy_classifier_create_by_path - * @param[in] mats a batch of images - * @param[in] mat_count number of images in the batch - * @param[out] results a linear buffer to save classification results of each - * image, which must be freed by \ref mmdeploy_classifier_release_result - * @param[out] result_count a linear buffer with length being \p mat_count to save the number of - * classification results of each image. 
It must be released by \ref - * mmdeploy_classifier_release_result - * @return status of inference - */ -MMDEPLOY_API int mmdeploy_classifier_apply(mmdeploy_classifier_t classifier, - const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_classification_t** results, int** result_count); - -/** - * @brief Release the inference result buffer created \ref mmdeploy_classifier_apply - * @param[in] results classification results buffer - * @param[in] result_count \p results size buffer - * @param[in] count length of \p result_count - */ -MMDEPLOY_API void mmdeploy_classifier_release_result(mmdeploy_classification_t* results, - const int* result_count, int count); - -/** - * @brief Destroy classifier's handle - * @param[in] classifier classifier's handle created by \ref mmdeploy_classifier_create_by_path - */ -MMDEPLOY_API void mmdeploy_classifier_destroy(mmdeploy_classifier_t classifier); - -/****************************************************************************** - * Experimental asynchronous APIs */ - -/** - * @brief Same as \ref mmdeploy_classifier_create, but allows to control execution context of tasks - * via context - */ -MMDEPLOY_API int mmdeploy_classifier_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_classifier_t* classifier); - -/** - * @brief Pack classifier inputs into mmdeploy_value_t - * @param[in] mats a batch of images - * @param[in] mat_count number of images in the batch - * @param[out] value the packed value - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_classifier_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* value); - -/** - * @brief Same as \ref mmdeploy_classifier_apply, but input and output are packed in \ref - * mmdeploy_value_t. - */ -MMDEPLOY_API int mmdeploy_classifier_apply_v2(mmdeploy_classifier_t classifier, - mmdeploy_value_t input, mmdeploy_value_t* output); - -/** - * @brief Apply classifier asynchronously - * @param[in] classifier handle of the classifier - * @param[in] input input sender that will be consumed by the operation - * @param[out] output output sender - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_classifier_apply_async(mmdeploy_classifier_t classifier, - mmdeploy_sender_t input, - mmdeploy_sender_t* output); - -/** - * - * @param[in] output output obtained by applying a classifier - * @param[out] results a linear buffer containing classification results of each image, released by - * \ref mmdeploy_classifier_release_result - * @param[out] result_count a linear buffer containing the number of results for each input image, - * released by \ref mmdeploy_classifier_release_result - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_classifier_get_result(mmdeploy_value_t output, - mmdeploy_classification_t** results, - int** result_count); + typedef struct mmdeploy_classification_t + { + int label_id; + float score; + } mmdeploy_classification_t; + + typedef struct mmdeploy_classifier* mmdeploy_classifier_t; + + /** + * @brief Create classifier's handle + * @param[in] model an instance of mmclassification sdk model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. 
+ * @param[out] classifier instance of a classifier, which must be destroyed + * by \ref mmdeploy_classifier_destroy + * @return status of creating classifier's handle + */ + MMDEPLOY_API int mmdeploy_classifier_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_classifier_t* classifier); + + /** + * @brief Create classifier's handle + * @param[in] model_path path of mmclassification sdk model exported by mmdeploy model converter + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. + * @param[out] classifier instance of a classifier, which must be destroyed + * by \ref mmdeploy_classifier_destroy + * @return status of creating classifier's handle + */ + MMDEPLOY_API int mmdeploy_classifier_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_classifier_t* classifier); + + /** + * @brief Use classifier created by \ref mmdeploy_classifier_create_by_path to get label + * information of each image in a batch + * @param[in] classifier classifier's handle created by \ref mmdeploy_classifier_create_by_path + * @param[in] mats a batch of images + * @param[in] mat_count number of images in the batch + * @param[out] results a linear buffer to save classification results of each + * image, which must be freed by \ref mmdeploy_classifier_release_result + * @param[out] result_count a linear buffer with length being \p mat_count to save the number of + * classification results of each image. It must be released by \ref + * mmdeploy_classifier_release_result + * @return status of inference + */ + MMDEPLOY_API int mmdeploy_classifier_apply(mmdeploy_classifier_t classifier, + const mmdeploy_mat_t* mats, + int mat_count, + mmdeploy_classification_t** results, + int** result_count); + + /** + * @brief Release the inference result buffer created \ref mmdeploy_classifier_apply + * @param[in] results classification results buffer + * @param[in] result_count \p results size buffer + * @param[in] count length of \p result_count + */ + MMDEPLOY_API void mmdeploy_classifier_release_result(mmdeploy_classification_t* results, + const int* result_count, + int count); + + /** + * @brief Destroy classifier's handle + * @param[in] classifier classifier's handle created by \ref mmdeploy_classifier_create_by_path + */ + MMDEPLOY_API void mmdeploy_classifier_destroy(mmdeploy_classifier_t classifier); + + /****************************************************************************** + * Experimental asynchronous APIs */ + + /** + * @brief Same as \ref mmdeploy_classifier_create, but allows to control execution context of tasks + * via context + */ + MMDEPLOY_API int mmdeploy_classifier_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_classifier_t* classifier); + + /** + * @brief Pack classifier inputs into mmdeploy_value_t + * @param[in] mats a batch of images + * @param[in] mat_count number of images in the batch + * @param[out] value the packed value + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_classifier_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value); + + /** + * @brief Same as \ref mmdeploy_classifier_apply, but input and output are packed in \ref + * mmdeploy_value_t. 
+ */ + MMDEPLOY_API int mmdeploy_classifier_apply_v2(mmdeploy_classifier_t classifier, + mmdeploy_value_t input, + mmdeploy_value_t* output); + + /** + * @brief Apply classifier asynchronously + * @param[in] classifier handle of the classifier + * @param[in] input input sender that will be consumed by the operation + * @param[out] output output sender + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_classifier_apply_async(mmdeploy_classifier_t classifier, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); + + /** + * + * @param[in] output output obtained by applying a classifier + * @param[out] results a linear buffer containing classification results of each image, released by + * \ref mmdeploy_classifier_release_result + * @param[out] result_count a linear buffer containing the number of results for each input image, + * released by \ref mmdeploy_classifier_release_result + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_classifier_get_result(mmdeploy_value_t output, + mmdeploy_classification_t** results, + int** result_count); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/common.cpp b/csrc/mmdeploy/apis/c/mmdeploy/common.cpp index e00cc3f1cf..fff83da181 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/common.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/common.cpp @@ -5,111 +5,142 @@ #include "mmdeploy/core/profiler.h" #include "mmdeploy/executor_internal.h" -mmdeploy_value_t mmdeploy_value_copy(mmdeploy_value_t value) { - if (!value) { - return nullptr; - } - return Guard([&] { return Take(Value(*Cast(value))); }); +mmdeploy_value_t mmdeploy_value_copy(mmdeploy_value_t value) +{ + if (!value) + { + return nullptr; + } + return Guard([&] + { return Take(Value(*Cast(value))); }); } -void mmdeploy_value_destroy(mmdeploy_value_t value) { delete Cast(value); } +void mmdeploy_value_destroy(mmdeploy_value_t value) +{ + delete Cast(value); +} -int mmdeploy_context_create(mmdeploy_context_t* context) { - *context = (mmdeploy_context_t) new Value; - return 0; +int mmdeploy_context_create(mmdeploy_context_t* context) +{ + *context = (mmdeploy_context_t) new Value; + return 0; } -int mmdeploy_context_create_by_device(const char* device_name, int device_id, - mmdeploy_context_t* context) { - mmdeploy_device_t device{}; - int ec = MMDEPLOY_SUCCESS; - mmdeploy_context_t _context{}; - ec = mmdeploy_context_create(&_context); - if (ec != MMDEPLOY_SUCCESS) { - return ec; - } - ec = mmdeploy_device_create(device_name, device_id, &device); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_context_create_by_device(const char* device_name, int device_id, mmdeploy_context_t* context) +{ + mmdeploy_device_t device{}; + int ec = MMDEPLOY_SUCCESS; + mmdeploy_context_t _context{}; + ec = mmdeploy_context_create(&_context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_device_create(device_name, device_id, &device); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_context_add(_context, MMDEPLOY_TYPE_DEVICE, nullptr, device); + mmdeploy_device_destroy(device); + if (ec == MMDEPLOY_SUCCESS) + { + *context = _context; + } return ec; - } - ec = mmdeploy_context_add(_context, MMDEPLOY_TYPE_DEVICE, nullptr, device); - mmdeploy_device_destroy(device); - if (ec == MMDEPLOY_SUCCESS) { - *context = _context; - } - return ec; } -void mmdeploy_context_destroy(mmdeploy_context_t context) { delete Cast(context); } +void mmdeploy_context_destroy(mmdeploy_context_t context) +{ + delete Cast(context); +} -int mmdeploy_common_create_input(const 
mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* value) { - if (mat_count && mats == nullptr) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - auto input = std::make_unique(Value{Value::kArray}); - for (int i = 0; i < mat_count; ++i) { - input->front().push_back({{"ori_img", Cast(mats[i])}}); +int mmdeploy_common_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value) +{ + if (mat_count && mats == nullptr) + { + return MMDEPLOY_E_INVALID_ARG; } - *value = Cast(input.release()); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_SUCCESS; + try + { + auto input = std::make_unique(Value{Value::kArray}); + for (int i = 0; i < mat_count; ++i) + { + input->front().push_back({{"ori_img", Cast(mats[i])}}); + } + *value = Cast(input.release()); + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_SUCCESS; } -int mmdeploy_device_create(const char* device_name, int device_id, mmdeploy_device_t* device) { - Device tmp(device_name, device_id); - if (tmp.platform_id() == -1) { - MMDEPLOY_ERROR("Device \"{}\" not found", device_name); - return MMDEPLOY_E_INVALID_ARG; - } - *device = (mmdeploy_device_t) new Device(tmp); - return MMDEPLOY_SUCCESS; +int mmdeploy_device_create(const char* device_name, int device_id, mmdeploy_device_t* device) +{ + Device tmp(device_name, device_id); + if (tmp.platform_id() == -1) + { + MMDEPLOY_ERROR("Device \"{}\" not found", device_name); + return MMDEPLOY_E_INVALID_ARG; + } + *device = (mmdeploy_device_t) new Device(tmp); + return MMDEPLOY_SUCCESS; } -void mmdeploy_device_destroy(mmdeploy_device_t device) { delete (Device*)device; } - -int mmdeploy_profiler_create(const char* path, mmdeploy_profiler_t* profiler) { - *profiler = (mmdeploy_profiler_t) new profiler::Profiler(path); - return MMDEPLOY_SUCCESS; +void mmdeploy_device_destroy(mmdeploy_device_t device) +{ + delete (Device*)device; } -void mmdeploy_profiler_destroy(mmdeploy_profiler_t profiler) { - if (profiler) { - auto p = (profiler::Profiler*)profiler; - p->Release(); - delete p; - } +int mmdeploy_profiler_create(const char* path, mmdeploy_profiler_t* profiler) +{ + *profiler = (mmdeploy_profiler_t) new profiler::Profiler(path); + return MMDEPLOY_SUCCESS; } -int mmdeploy_context_add(mmdeploy_context_t context, mmdeploy_context_type_t type, const char* name, - const void* object) { - auto& ctx = *Cast(context); - switch (type) { - case MMDEPLOY_TYPE_DEVICE: { - const auto& device = *(Device*)object; - ctx["device"] = device; - ctx["stream"] = Stream(device); - break; +void mmdeploy_profiler_destroy(mmdeploy_profiler_t profiler) +{ + if (profiler) + { + auto p = (profiler::Profiler*)profiler; + p->Release(); + delete p; } - case MMDEPLOY_TYPE_SCHEDULER: - ctx["scheduler"][name] = *Cast((const mmdeploy_scheduler_t)object); - break; - case MMDEPLOY_TYPE_MODEL: - ctx["model"][name] = *Cast((const mmdeploy_model_t)object); - break; - case MMDEPLOY_TYPE_PROFILER: { - const auto& profiler = *(profiler::Profiler*)object; - profiler::Scope* root(profiler.scope()); - ctx["scope"] = root; - break; +} + +int mmdeploy_context_add(mmdeploy_context_t context, mmdeploy_context_type_t type, const char* name, const void* object) +{ + auto& ctx = *Cast(context); + switch (type) + { + case MMDEPLOY_TYPE_DEVICE: + { + const auto& device = 
*(Device*)object; + ctx["device"] = device; + ctx["stream"] = Stream(device); + break; + } + case MMDEPLOY_TYPE_SCHEDULER: + ctx["scheduler"][name] = *Cast((const mmdeploy_scheduler_t)object); + break; + case MMDEPLOY_TYPE_MODEL: + ctx["model"][name] = *Cast((const mmdeploy_model_t)object); + break; + case MMDEPLOY_TYPE_PROFILER: + { + const auto& profiler = *(profiler::Profiler*)object; + profiler::Scope* root(profiler.scope()); + ctx["scope"] = root; + break; + } + default: + return MMDEPLOY_E_NOT_SUPPORTED; } - default: - return MMDEPLOY_E_NOT_SUPPORTED; - } - return 0; + return 0; } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/common.h b/csrc/mmdeploy/apis/c/mmdeploy/common.h index c665134cbf..26b92973ca 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/common.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/common.h @@ -6,19 +6,19 @@ #include // NOLINT #ifndef MMDEPLOY_EXPORT -#ifdef _MSC_VER -#define MMDEPLOY_EXPORT __declspec(dllexport) -#else -#define MMDEPLOY_EXPORT __attribute__((visibility("default"))) -#endif + #ifdef _MSC_VER + #define MMDEPLOY_EXPORT __declspec(dllexport) + #else + #define MMDEPLOY_EXPORT __attribute__((visibility("default"))) + #endif #endif #ifndef MMDEPLOY_API -#ifdef MMDEPLOY_API_EXPORTS -#define MMDEPLOY_API MMDEPLOY_EXPORT -#else -#define MMDEPLOY_API -#endif + #ifdef MMDEPLOY_API_EXPORTS + #define MMDEPLOY_API MMDEPLOY_EXPORT + #else + #define MMDEPLOY_API + #endif #endif // clang-format off @@ -54,136 +54,137 @@ typedef enum mmdeploy_status_t { // clang-format on -typedef struct mmdeploy_device* mmdeploy_device_t; +typedef struct mmdeploy_device* mmdeploy_device_t; typedef struct mmdeploy_profiler* mmdeploy_profiler_t; -typedef struct mmdeploy_mat_t { - uint8_t* data; - int height; - int width; - int channel; - mmdeploy_pixel_format_t format; - mmdeploy_data_type_t type; - mmdeploy_device_t device; +typedef struct mmdeploy_mat_t +{ + uint8_t* data; + int height; + int width; + int channel; + mmdeploy_pixel_format_t format; + mmdeploy_data_type_t type; + mmdeploy_device_t device; } mmdeploy_mat_t; -typedef struct mmdeploy_rect_t { - float left; - float top; - float right; - float bottom; +typedef struct mmdeploy_rect_t +{ + float left; + float top; + float right; + float bottom; } mmdeploy_rect_t; -typedef struct mmdeploy_point_t { - float x; - float y; +typedef struct mmdeploy_point_t +{ + float x; + float y; } mmdeploy_point_t; -typedef struct mmdeploy_value* mmdeploy_value_t; +typedef struct mmdeploy_value* mmdeploy_value_t; typedef struct mmdeploy_context* mmdeploy_context_t; -typedef enum mmdeploy_context_type_t { - MMDEPLOY_TYPE_DEVICE = 0, - MMDEPLOY_TYPE_STREAM = 1, - MMDEPLOY_TYPE_MODEL = 2, - MMDEPLOY_TYPE_SCHEDULER = 3, - MMDEPLOY_TYPE_MAT = 4, - MMDEPLOY_TYPE_PROFILER = 5, +typedef enum mmdeploy_context_type_t +{ + MMDEPLOY_TYPE_DEVICE = 0, + MMDEPLOY_TYPE_STREAM = 1, + MMDEPLOY_TYPE_MODEL = 2, + MMDEPLOY_TYPE_SCHEDULER = 3, + MMDEPLOY_TYPE_MAT = 4, + MMDEPLOY_TYPE_PROFILER = 5, } mmdeploy_context_type_t; #if __cplusplus -extern "C" { +extern "C" +{ #endif -/** - * Copy value - * @param value - * @return - */ -MMDEPLOY_API mmdeploy_value_t mmdeploy_value_copy(mmdeploy_value_t value); - -/** - * Destroy value - * @param value - */ -MMDEPLOY_API void mmdeploy_value_destroy(mmdeploy_value_t value); - -/** - * Create device handle - * @param device_name - * @param device_id - * @param device - * @return - */ -MMDEPLOY_API int mmdeploy_device_create(const char* device_name, int device_id, - mmdeploy_device_t* device); - -/** - * Destroy device handle - * @param 
device - */ -MMDEPLOY_API void mmdeploy_device_destroy(mmdeploy_device_t device); - -/** - * Create profiler - * @param path path to save the profile data - * @param profiler handle for profiler, should be added to context and deleted by - * mmdeploy_profiler_destroy - * @return status of create - */ -MMDEPLOY_API int mmdeploy_profiler_create(const char* path, mmdeploy_profiler_t* profiler); - -/** - * Destroy profiler handle - * @param profiler handle for profiler, profile data will be written to disk after this call - */ -MMDEPLOY_API void mmdeploy_profiler_destroy(mmdeploy_profiler_t profiler); - -/** - * Create context - * @param context - * @return - */ -MMDEPLOY_API int mmdeploy_context_create(mmdeploy_context_t* context); - -/** - * Create context - * @param device_name - * @param device_id - * @param context - * @return - */ -MMDEPLOY_API int mmdeploy_context_create_by_device(const char* device_name, int device_id, - mmdeploy_context_t* context); - -/** - * Destroy context - * @param context - */ -MMDEPLOY_API void mmdeploy_context_destroy(mmdeploy_context_t context); - -/** - * Add context object - * @param context - * @param type - * @param name - * @param object - * @return - */ -MMDEPLOY_API int mmdeploy_context_add(mmdeploy_context_t context, mmdeploy_context_type_t type, - const char* name, const void* object); - -/** - * Create input value from array of mats - * @param mats - * @param mat_count - * @param value - * @return - */ -MMDEPLOY_API int mmdeploy_common_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* value); + /** + * Copy value + * @param value + * @return + */ + MMDEPLOY_API mmdeploy_value_t mmdeploy_value_copy(mmdeploy_value_t value); + + /** + * Destroy value + * @param value + */ + MMDEPLOY_API void mmdeploy_value_destroy(mmdeploy_value_t value); + + /** + * Create device handle + * @param device_name + * @param device_id + * @param device + * @return + */ + MMDEPLOY_API int mmdeploy_device_create(const char* device_name, int device_id, mmdeploy_device_t* device); + + /** + * Destroy device handle + * @param device + */ + MMDEPLOY_API void mmdeploy_device_destroy(mmdeploy_device_t device); + + /** + * Create profiler + * @param path path to save the profile data + * @param profiler handle for profiler, should be added to context and deleted by + * mmdeploy_profiler_destroy + * @return status of create + */ + MMDEPLOY_API int mmdeploy_profiler_create(const char* path, mmdeploy_profiler_t* profiler); + + /** + * Destroy profiler handle + * @param profiler handle for profiler, profile data will be written to disk after this call + */ + MMDEPLOY_API void mmdeploy_profiler_destroy(mmdeploy_profiler_t profiler); + + /** + * Create context + * @param context + * @return + */ + MMDEPLOY_API int mmdeploy_context_create(mmdeploy_context_t* context); + + /** + * Create context + * @param device_name + * @param device_id + * @param context + * @return + */ + MMDEPLOY_API int mmdeploy_context_create_by_device(const char* device_name, int device_id, mmdeploy_context_t* context); + + /** + * Destroy context + * @param context + */ + MMDEPLOY_API void mmdeploy_context_destroy(mmdeploy_context_t context); + + /** + * Add context object + * @param context + * @param type + * @param name + * @param object + * @return + */ + MMDEPLOY_API int mmdeploy_context_add(mmdeploy_context_t context, mmdeploy_context_type_t type, const char* name, const void* object); + + /** + * Create input value from array of mats + * @param mats + * @param mat_count + * @param 
value + * @return + */ + MMDEPLOY_API int mmdeploy_common_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value); #if __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/common_internal.h b/csrc/mmdeploy/apis/c/mmdeploy/common_internal.h index a1ddecb54d..6beb2f6b5e 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/common_internal.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/common_internal.h @@ -12,93 +12,152 @@ using namespace mmdeploy; -namespace { - -inline mmdeploy_value_t Cast(Value* s) { return reinterpret_cast(s); } - -inline Value* Cast(mmdeploy_value_t s) { return reinterpret_cast(s); } - -inline Value Take(mmdeploy_value_t v) { - auto value = std::move(*Cast(v)); - mmdeploy_value_destroy(v); - return value; -} - -inline Value* Cast(mmdeploy_context_t c) { return reinterpret_cast(c); } - -inline mmdeploy_value_t Take(Value v) { - return Cast(new Value(std::move(v))); // NOLINT -} - -inline mmdeploy_pipeline_t Cast(AsyncHandle* pipeline) { - return reinterpret_cast(pipeline); -} - -inline AsyncHandle* Cast(mmdeploy_pipeline_t pipeline) { - return reinterpret_cast(pipeline); -} - -inline mmdeploy_model_t Cast(Model* model) { return reinterpret_cast(model); } - -inline Model* Cast(mmdeploy_model_t model) { return reinterpret_cast(model); } - -inline Mat Cast(const mmdeploy_mat_t& mat) { - return Mat{mat.height, mat.width, PixelFormat(mat.format), - DataType(mat.type), mat.data, mat.device ? *(const Device*)mat.device : Device{0}}; -} - -template -std::invoke_result_t Guard(F f) { - try { - return f(); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return nullptr; -} - -template -class wrapped {}; - -template -class wrapped> { - public: - wrapped() noexcept : v_(nullptr) {} - explicit wrapped(T v) noexcept : v_(v) {} - - void reset() { - if (v_) { - delete Cast(v_); - v_ = nullptr; +namespace +{ + + inline mmdeploy_value_t Cast(Value* s) + { + return reinterpret_cast(s); + } + + inline Value* Cast(mmdeploy_value_t s) + { + return reinterpret_cast(s); + } + + inline Value Take(mmdeploy_value_t v) + { + auto value = std::move(*Cast(v)); + mmdeploy_value_destroy(v); + return value; + } + + inline Value* Cast(mmdeploy_context_t c) + { + return reinterpret_cast(c); } - } - ~wrapped() { reset(); } + inline mmdeploy_value_t Take(Value v) + { + return Cast(new Value(std::move(v))); // NOLINT + } - wrapped(const wrapped&) = delete; - wrapped& operator=(const wrapped&) = delete; + inline mmdeploy_pipeline_t Cast(AsyncHandle* pipeline) + { + return reinterpret_cast(pipeline); + } - wrapped(wrapped&& other) noexcept : v_(other.release()) {} - wrapped& operator=(wrapped&& other) noexcept { - reset(); - v_ = other.release(); - return *this; - } + inline AsyncHandle* Cast(mmdeploy_pipeline_t pipeline) + { + return reinterpret_cast(pipeline); + } - T release() noexcept { return std::exchange(v_, nullptr); } + inline mmdeploy_model_t Cast(Model* model) + { + return reinterpret_cast(model); + } - auto operator*() { return Cast(v_); } - auto operator-> () { return Cast(v_); } + inline Model* Cast(mmdeploy_model_t model) + { + return reinterpret_cast(model); + } - T* ptr() noexcept { return &v_; } + inline Mat Cast(const mmdeploy_mat_t& mat) + { + return Mat{mat.height, mat.width, PixelFormat(mat.format), DataType(mat.type), mat.data, mat.device ? 
*(const Device*)mat.device : Device{0}}; + } - operator T() const noexcept { return v_; } // NOLINT + template + std::invoke_result_t Guard(F f) + { + try + { + return f(); + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return nullptr; + } - private: - T v_; -}; + template + class wrapped + { + }; + + template + class wrapped> + { + public: + wrapped() noexcept + : v_(nullptr) + { + } + explicit wrapped(T v) noexcept + : v_(v) + { + } + + void reset() + { + if (v_) + { + delete Cast(v_); + v_ = nullptr; + } + } + + ~wrapped() + { + reset(); + } + + wrapped(const wrapped&) = delete; + wrapped& operator=(const wrapped&) = delete; + + wrapped(wrapped&& other) noexcept + : v_(other.release()) + { + } + wrapped& operator=(wrapped&& other) noexcept + { + reset(); + v_ = other.release(); + return *this; + } + + T release() noexcept + { + return std::exchange(v_, nullptr); + } + + auto operator*() + { + return Cast(v_); + } + auto operator->() + { + return Cast(v_); + } + + T* ptr() noexcept + { + return &v_; + } + + operator T() const noexcept + { + return v_; + } // NOLINT + + private: + T v_; + }; } // namespace diff --git a/csrc/mmdeploy/apis/c/mmdeploy/detector.cpp b/csrc/mmdeploy/apis/c/mmdeploy/detector.cpp index aadf92fb62..30ea52fcab 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/detector.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/detector.cpp @@ -24,126 +24,142 @@ using ResultType = mmdeploy::Structure, // std::vector>; // -int mmdeploy_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_detector_t* detector) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_detector_t* detector) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_detector_create_v2(model, context, detector); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_detector_create_v2(model, context, detector); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_detector_t* detector) { - return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector); +int mmdeploy_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_detector_t* detector) +{ + return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector); } -int mmdeploy_detector_create_by_path(const char* model_path, const char* device_name, int device_id, - mmdeploy_detector_t* detector) { - mmdeploy_model_t model{}; +int mmdeploy_detector_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_detector_t* detector) +{ + mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { + if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = mmdeploy_detector_create(model, device_name, device_id, detector); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_detector_create(model, device_name, device_id, detector); - mmdeploy_model_destroy(model); - return ec; } -int 
mmdeploy_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* input) { - return mmdeploy_common_create_input(mats, mat_count, input); +int mmdeploy_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* input) +{ + return mmdeploy_common_create_input(mats, mat_count, input); } -int mmdeploy_detector_apply(mmdeploy_detector_t detector, const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_detection_t** results, int** result_count) { - wrapped input; - if (auto ec = mmdeploy_detector_create_input(mats, mat_count, input.ptr())) { - return ec; - } - wrapped output; - if (auto ec = mmdeploy_detector_apply_v2(detector, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_detector_get_result(output, results, result_count)) { - return ec; - } - return MMDEPLOY_SUCCESS; +int mmdeploy_detector_apply(mmdeploy_detector_t detector, const mmdeploy_mat_t* mats, int mat_count, mmdeploy_detection_t** results, int** result_count) +{ + wrapped input; + if (auto ec = mmdeploy_detector_create_input(mats, mat_count, input.ptr())) + { + return ec; + } + wrapped output; + if (auto ec = mmdeploy_detector_apply_v2(detector, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_detector_get_result(output, results, result_count)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } -int mmdeploy_detector_apply_v2(mmdeploy_detector_t detector, mmdeploy_value_t input, - mmdeploy_value_t* output) { - return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output); +int mmdeploy_detector_apply_v2(mmdeploy_detector_t detector, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output); } -int mmdeploy_detector_apply_async(mmdeploy_detector_t detector, mmdeploy_sender_t input, - mmdeploy_sender_t* output) { - return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output); +int mmdeploy_detector_apply_async(mmdeploy_detector_t detector, mmdeploy_sender_t input, mmdeploy_sender_t* output) +{ + return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output); } -int mmdeploy_detector_get_result(mmdeploy_value_t output, mmdeploy_detection_t** results, - int** result_count) { - if (!output || !results || !result_count) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - Value& value = Cast(output)->front(); - auto detector_outputs = from_value>(value); - - vector _result_count(detector_outputs.size()); - size_t total = 0; - for (size_t i = 0; i < detector_outputs.size(); ++i) { - _result_count[i] = static_cast(detector_outputs[i].size()); - total += detector_outputs[i].size(); +int mmdeploy_detector_get_result(mmdeploy_value_t output, mmdeploy_detection_t** results, int** result_count) +{ + if (!output || !results || !result_count) + { + return MMDEPLOY_E_INVALID_ARG; } + try + { + Value& value = Cast(output)->front(); + auto detector_outputs = from_value>(value); + + vector _result_count(detector_outputs.size()); + size_t total = 0; + for (size_t i = 0; i < detector_outputs.size(); ++i) + { + _result_count[i] = static_cast(detector_outputs[i].size()); + total += detector_outputs[i].size(); + } - ResultType r({total, 1, 1, 1}); - auto [result_data, result_count_vec, masks, buffers] = r.pointers(); - - auto result_ptr = result_data; - - for (const auto& det_output : detector_outputs) { - for (const auto& detection : det_output) { - result_ptr->label_id = detection.label_id; - result_ptr->score = detection.score; - const auto& 
bbox = detection.bbox; - result_ptr->bbox = {bbox[0], bbox[1], bbox[2], bbox[3]}; - auto mask_byte_size = detection.mask.byte_size(); - if (mask_byte_size) { - auto& mask = detection.mask; - result_ptr->mask = &masks->emplace_back(); - buffers->push_back(mask.buffer()); - result_ptr->mask->data = mask.data(); - result_ptr->mask->width = mask.width(); - result_ptr->mask->height = mask.height(); + ResultType r({total, 1, 1, 1}); + auto [result_data, result_count_vec, masks, buffers] = r.pointers(); + + auto result_ptr = result_data; + + for (const auto& det_output : detector_outputs) + { + for (const auto& detection : det_output) + { + result_ptr->label_id = detection.label_id; + result_ptr->score = detection.score; + const auto& bbox = detection.bbox; + result_ptr->bbox = {bbox[0], bbox[1], bbox[2], bbox[3]}; + auto mask_byte_size = detection.mask.byte_size(); + if (mask_byte_size) + { + auto& mask = detection.mask; + result_ptr->mask = &masks->emplace_back(); + buffers->push_back(mask.buffer()); + result_ptr->mask->data = mask.data(); + result_ptr->mask->width = mask.width(); + result_ptr->mask->height = mask.height(); + } + ++result_ptr; + } } - ++result_ptr; - } - } - *result_count_vec = std::move(_result_count); - *result_count = result_count_vec->data(); - *results = result_data; - r.release(); + *result_count_vec = std::move(_result_count); + *result_count = result_count_vec->data(); + *results = result_data; + r.release(); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) 
+ { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -void mmdeploy_detector_release_result(mmdeploy_detection_t* results, const int* result_count, - int count) { - auto num_dets = std::accumulate(result_count, result_count + count, 0); - ResultType deleter({static_cast(num_dets), 1, 1, 1}, results); +void mmdeploy_detector_release_result(mmdeploy_detection_t* results, const int* result_count, int count) +{ + auto num_dets = std::accumulate(result_count, result_count + count, 0); + ResultType deleter({static_cast(num_dets), 1, 1, 1}, results); } -void mmdeploy_detector_destroy(mmdeploy_detector_t detector) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector); +void mmdeploy_detector_destroy(mmdeploy_detector_t detector) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector); } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/detector.h b/csrc/mmdeploy/apis/c/mmdeploy/detector.h index 5c5ba2f356..713214ca4f 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/detector.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/detector.h @@ -13,124 +13,123 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_instance_mask_t { - char* data; - int height; - int width; -} mmdeploy_instance_mask_t; - -typedef struct mmdeploy_detection_t { - int label_id; - float score; - mmdeploy_rect_t bbox; - mmdeploy_instance_mask_t* mask; -} mmdeploy_detection_t; - -typedef struct mmdeploy_detector* mmdeploy_detector_t; - -/** - * @brief Create detector's handle - * @param[in] model an instance of mmdetection sdk model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] detector instance of a detector - * @return status of creating detector's handle - */ -MMDEPLOY_API int mmdeploy_detector_create(mmdeploy_model_t model, const char* device_name, - int device_id, mmdeploy_detector_t* detector); - -/** - * @brief Create detector's handle - * @param[in] model_path path of mmdetection sdk model exported by mmdeploy model converter - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] detector instance of a detector - * @return status of creating detector's handle - */ -MMDEPLOY_API int mmdeploy_detector_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_detector_t* detector); - -/** - * @brief Apply detector to batch images and get their inference results - * @param[in] detector detector's handle created by \ref mmdeploy_detector_create_by_path - * @param[in] mats a batch of images - * @param[in] mat_count number of images in the batch - * @param[out] results a linear buffer to save detection results of each image. It must be released - * by \ref mmdeploy_detector_release_result - * @param[out] result_count a linear buffer with length being \p mat_count to save the number of - * detection results of each image. 
And it must be released by \ref - * mmdeploy_detector_release_result - * @return status of inference - */ -MMDEPLOY_API int mmdeploy_detector_apply(mmdeploy_detector_t detector, const mmdeploy_mat_t* mats, - int mat_count, mmdeploy_detection_t** results, - int** result_count); - -/** @brief Release the inference result buffer created by \ref mmdeploy_detector_apply - * @param[in] results detection results buffer - * @param[in] result_count \p results size buffer - * @param[in] count length of \p result_count - */ -MMDEPLOY_API void mmdeploy_detector_release_result(mmdeploy_detection_t* results, - const int* result_count, int count); - -/** - * @brief Destroy detector's handle - * @param[in] detector detector's handle created by \ref mmdeploy_detector_create_by_path - */ -MMDEPLOY_API void mmdeploy_detector_destroy(mmdeploy_detector_t detector); - -/****************************************************************************** - * Experimental asynchronous APIs */ - -/** - * @brief Same as \ref mmdeploy_detector_create, but allows to control execution context of tasks - * via context - */ -MMDEPLOY_API int mmdeploy_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_detector_t* detector); - -/** - * @brief Pack detector inputs into mmdeploy_value_t - * @param[in] mats a batch of images - * @param[in] mat_count number of images in the batch - * @return the created value - */ -MMDEPLOY_API int mmdeploy_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* input); - -/** - * @brief Same as \ref mmdeploy_detector_apply, but input and output are packed in \ref - * mmdeploy_value_t. - */ -MMDEPLOY_API int mmdeploy_detector_apply_v2(mmdeploy_detector_t detector, mmdeploy_value_t input, - mmdeploy_value_t* output); - -/** - * @brief Apply detector asynchronously - * @param[in] detector handle to the detector - * @param[in] input input sender - * @return output sender - */ -MMDEPLOY_API int mmdeploy_detector_apply_async(mmdeploy_detector_t detector, - mmdeploy_sender_t input, mmdeploy_sender_t* output); - -/** - * @brief Unpack detector output from a mmdeploy_value_t - * @param[in] output output obtained by applying a detector - * @param[out] results a linear buffer to save detection results of each image. It must be released - * by \ref mmdeploy_detector_release_result - * @param[out] result_count a linear buffer with length number of input images to save the number of - * detection results of each image. Must be released by \ref - * mmdeploy_detector_release_result - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_detector_get_result(mmdeploy_value_t output, - mmdeploy_detection_t** results, int** result_count); + typedef struct mmdeploy_instance_mask_t + { + char* data; + int height; + int width; + } mmdeploy_instance_mask_t; + + typedef struct mmdeploy_detection_t + { + int label_id; + float score; + mmdeploy_rect_t bbox; + mmdeploy_instance_mask_t* mask; + } mmdeploy_detection_t; + + typedef struct mmdeploy_detector* mmdeploy_detector_t; + + /** + * @brief Create detector's handle + * @param[in] model an instance of mmdetection sdk model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. 
+ * @param[out] detector instance of a detector
+ * @return status of creating detector's handle
+ */
+ MMDEPLOY_API int mmdeploy_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_detector_t* detector);
+
+ /**
+ * @brief Create detector's handle
+ * @param[in] model_path path of mmdetection sdk model exported by mmdeploy model converter
+ * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+ * @param[in] device_id id of device.
+ * @param[out] detector instance of a detector
+ * @return status of creating detector's handle
+ */
+ MMDEPLOY_API int mmdeploy_detector_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_detector_t* detector);
+
+ /**
+ * @brief Apply detector to batch images and get their inference results
+ * @param[in] detector detector's handle created by \ref mmdeploy_detector_create_by_path
+ * @param[in] mats a batch of images
+ * @param[in] mat_count number of images in the batch
+ * @param[out] results a linear buffer to save detection results of each image. It must be released
+ * by \ref mmdeploy_detector_release_result
+ * @param[out] result_count a linear buffer with length being \p mat_count to save the number of
+ * detection results of each image. It must be released by \ref
+ * mmdeploy_detector_release_result
+ * @return status of inference
+ */
+ MMDEPLOY_API int mmdeploy_detector_apply(mmdeploy_detector_t detector, const mmdeploy_mat_t* mats, int mat_count, mmdeploy_detection_t** results, int** result_count);
+
+ /** @brief Release the inference result buffer created by \ref mmdeploy_detector_apply
+ * @param[in] results detection results buffer
+ * @param[in] result_count \p results size buffer
+ * @param[in] count length of \p result_count
+ */
+ MMDEPLOY_API void mmdeploy_detector_release_result(mmdeploy_detection_t* results,
+ const int* result_count,
+ int count);
+
+ /**
+ * @brief Destroy detector's handle
+ * @param[in] detector detector's handle created by \ref mmdeploy_detector_create_by_path
+ */
+ MMDEPLOY_API void mmdeploy_detector_destroy(mmdeploy_detector_t detector);
+
+ /******************************************************************************
+ * Experimental asynchronous APIs */
+
+ /**
+ * @brief Same as \ref mmdeploy_detector_create, but allows controlling the execution context of
+ * tasks via context
+ */
+ MMDEPLOY_API int mmdeploy_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_detector_t* detector);
+
+ /**
+ * @brief Pack detector inputs into mmdeploy_value_t
+ * @param[in] mats a batch of images
+ * @param[in] mat_count number of images in the batch
+ * @param[out] input the packed value
+ * @return status of the operation
+ */
+ MMDEPLOY_API int mmdeploy_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* input);
+
+ /**
+ * @brief Same as \ref mmdeploy_detector_apply, but input and output are packed in \ref
+ * mmdeploy_value_t.
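+ * A minimal sketch (assuming `detector`, `mats` and `mat_count` are already set up;
+ * error handling omitted):
+ * @code
+ *   mmdeploy_value_t input = NULL;
+ *   mmdeploy_value_t output = NULL;
+ *   mmdeploy_detection_t* dets = NULL;
+ *   int* det_count = NULL;
+ *   mmdeploy_detector_create_input(mats, mat_count, &input);
+ *   mmdeploy_detector_apply_v2(detector, input, &output);
+ *   mmdeploy_detector_get_result(output, &dets, &det_count);
+ *   // dets[i].mask may be NULL when the model predicts no instance mask
+ *   mmdeploy_detector_release_result(dets, det_count, mat_count);
+ *   mmdeploy_value_destroy(input);
+ *   mmdeploy_value_destroy(output);
+ * @endcode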
+ */ + MMDEPLOY_API int mmdeploy_detector_apply_v2(mmdeploy_detector_t detector, mmdeploy_value_t input, mmdeploy_value_t* output); + + /** + * @brief Apply detector asynchronously + * @param[in] detector handle to the detector + * @param[in] input input sender + * @return output sender + */ + MMDEPLOY_API int mmdeploy_detector_apply_async(mmdeploy_detector_t detector, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); + + /** + * @brief Unpack detector output from a mmdeploy_value_t + * @param[in] output output obtained by applying a detector + * @param[out] results a linear buffer to save detection results of each image. It must be released + * by \ref mmdeploy_detector_release_result + * @param[out] result_count a linear buffer with length number of input images to save the number of + * detection results of each image. Must be released by \ref + * mmdeploy_detector_release_result + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_detector_get_result(mmdeploy_value_t output, + mmdeploy_detection_t** results, + int** result_count); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/executor.cpp b/csrc/mmdeploy/apis/c/mmdeploy/executor.cpp index 2fdfb9091f..e73ffe0606 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/executor.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/executor.cpp @@ -9,199 +9,261 @@ using namespace mmdeploy; -namespace { +namespace +{ -mmdeploy_scheduler_t CreateScheduler(const char* type, const Value& config = Value()) { - try { - auto creator = gRegistry().Get(type); - if (!creator) { - MMDEPLOY_ERROR("Creator for {} not found. Available schedulers: {}", type, - gRegistry().List()); - return nullptr; + mmdeploy_scheduler_t CreateScheduler(const char* type, const Value& config = Value()) + { + try + { + auto creator = gRegistry().Get(type); + if (!creator) + { + MMDEPLOY_ERROR("Creator for {} not found. 
Available schedulers: {}", type, gRegistry().List()); + return nullptr; + } + return Cast(new SchedulerType(creator->Create(config))); + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("failed to create Scheduler: {} ({}), config: {}", type, e.what(), config); + return nullptr; + } } - return Cast(new SchedulerType(creator->Create(config))); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("failed to create Scheduler: {} ({}), config: {}", type, e.what(), config); - return nullptr; - } -} } // namespace -mmdeploy_sender_t mmdeploy_sender_copy(mmdeploy_sender_t input) { - if (!input) { - return nullptr; - } - return Take(SenderType(*Cast(input))); +mmdeploy_sender_t mmdeploy_sender_copy(mmdeploy_sender_t input) +{ + if (!input) + { + return nullptr; + } + return Take(SenderType(*Cast(input))); } -int mmdeploy_sender_destroy(mmdeploy_sender_t sender) { - delete Cast(sender); - return 0; +int mmdeploy_sender_destroy(mmdeploy_sender_t sender) +{ + delete Cast(sender); + return 0; } -mmdeploy_scheduler_t mmdeploy_executor_inline() { return CreateScheduler("Inline"); } +mmdeploy_scheduler_t mmdeploy_executor_inline() +{ + return CreateScheduler("Inline"); +} -mmdeploy_scheduler_t mmdeploy_executor_system_pool() { - // create a thread pool context and hold its shared handle - static auto scheduler = *Cast(CreateScheduler("ThreadPool")); - // return a copy of the handle to the thread pool - return Cast(new SchedulerType(scheduler)); +mmdeploy_scheduler_t mmdeploy_executor_system_pool() +{ + // create a thread pool context and hold its shared handle + static auto scheduler = *Cast(CreateScheduler("ThreadPool")); + // return a copy of the handle to the thread pool + return Cast(new SchedulerType(scheduler)); } -mmdeploy_scheduler_t mmdeploy_executor_create_thread_pool(int num_threads) { - return CreateScheduler("ThreadPool", {{"num_threads", num_threads}}); +mmdeploy_scheduler_t mmdeploy_executor_create_thread_pool(int num_threads) +{ + return CreateScheduler("ThreadPool", {{"num_threads", num_threads}}); } -mmdeploy_scheduler_t mmdeploy_executor_create_thread() { return CreateScheduler("SingleThread"); } +mmdeploy_scheduler_t mmdeploy_executor_create_thread() +{ + return CreateScheduler("SingleThread"); +} mmdeploy_scheduler_t mmdeploy_executor_dynamic_batch(mmdeploy_scheduler_t scheduler, - int max_batch_size, int timeout) { - if (!scheduler) { - return nullptr; - } - return CreateScheduler( - "DynamicBatch", - {{"scheduler", *Cast(scheduler)}, {"max_batch_size", max_batch_size}, {"timeout", timeout}}); + int max_batch_size, + int timeout) +{ + if (!scheduler) + { + return nullptr; + } + return CreateScheduler( + "DynamicBatch", + {{"scheduler", *Cast(scheduler)}, {"max_batch_size", max_batch_size}, {"timeout", timeout}}); } -int mmdeploy_scheduler_destroy(mmdeploy_scheduler_t scheduler) { - delete Cast(scheduler); - return 0; +int mmdeploy_scheduler_destroy(mmdeploy_scheduler_t scheduler) +{ + delete Cast(scheduler); + return 0; } -mmdeploy_sender_t mmdeploy_executor_just(mmdeploy_value_t value) { - if (value) { - return Guard([&] { return Take(Just(*Cast(value))); }); - } else { - return Take(Just(Value())); - } +mmdeploy_sender_t mmdeploy_executor_just(mmdeploy_value_t value) +{ + if (value) + { + return Guard([&] + { return Take(Just(*Cast(value))); }); + } + else + { + return Take(Just(Value())); + } } -mmdeploy_sender_t mmdeploy_executor_schedule(mmdeploy_scheduler_t scheduler) { - if (!scheduler) { - return nullptr; - } - return Guard([&] { return 
Take(Then(Schedule(*Cast(scheduler)), [] { return Value(); })); }); +mmdeploy_sender_t mmdeploy_executor_schedule(mmdeploy_scheduler_t scheduler) +{ + if (!scheduler) + { + return nullptr; + } + return Guard([&] + { return Take(Then(Schedule(*Cast(scheduler)), [] + { return Value(); })); }); } mmdeploy_sender_t mmdeploy_executor_transfer_just(mmdeploy_scheduler_t scheduler, - mmdeploy_value_t value) { - if (!scheduler || !value) { - return nullptr; - } - return Guard([&] { return Take(TransferJust(*Cast(scheduler), *Cast(value))); }); -} - -mmdeploy_sender_t mmdeploy_executor_transfer(mmdeploy_sender_t input, - mmdeploy_scheduler_t scheduler) { - if (!input || !scheduler) { - return nullptr; - } - return Guard([&] { return Take(Transfer(Take(input), *Cast(scheduler))); }); -} - -mmdeploy_sender_t mmdeploy_executor_on(mmdeploy_scheduler_t scheduler, mmdeploy_sender_t input) { - if (!scheduler || !input) { - return nullptr; - } - return Guard([&] { return Take(On(*Cast(scheduler), Take(input))); }); -} - -mmdeploy_sender_t mmdeploy_executor_then(mmdeploy_sender_t input, mmdeploy_then_fn_t fn, - void* context) { - if (!input || !fn) { - return nullptr; - } - return Guard([&] { - return Take(Then(Take(input), [fn, context](Value args) { + mmdeploy_value_t value) +{ + if (!scheduler || !value) + { + return nullptr; + } + return Guard([&] + { return Take(TransferJust(*Cast(scheduler), *Cast(value))); }); +} + +mmdeploy_sender_t mmdeploy_executor_transfer(mmdeploy_sender_t input, + mmdeploy_scheduler_t scheduler) +{ + if (!input || !scheduler) + { + return nullptr; + } + return Guard([&] + { return Take(Transfer(Take(input), *Cast(scheduler))); }); +} + +mmdeploy_sender_t mmdeploy_executor_on(mmdeploy_scheduler_t scheduler, mmdeploy_sender_t input) +{ + if (!scheduler || !input) + { + return nullptr; + } + return Guard([&] + { return Take(On(*Cast(scheduler), Take(input))); }); +} + +mmdeploy_sender_t mmdeploy_executor_then(mmdeploy_sender_t input, mmdeploy_then_fn_t fn, void* context) +{ + if (!input || !fn) + { + return nullptr; + } + return Guard([&] + { return Take(Then(Take(input), [fn, context](Value args) + { auto out = Cast(fn(Take(std::move(args)), context)); Value ret(std::move(*out)); delete out; - return ret; - })); - }); -} - -mmdeploy_sender_t mmdeploy_executor_let_value(mmdeploy_sender_t input, mmdeploy_let_value_fn_t fn, - void* context) { - if (!input || !fn) { - return nullptr; - } - return Guard([&] { - return Take(LetValue(Take(input), [fn, context](Value& args) { + return ret; })); }); +} + +mmdeploy_sender_t mmdeploy_executor_let_value(mmdeploy_sender_t input, mmdeploy_let_value_fn_t fn, void* context) +{ + if (!input || !fn) + { + return nullptr; + } + return Guard([&] + { return Take(LetValue(Take(input), [fn, context](Value& args) + { auto out = Cast(fn(Cast(&args), context)); SenderType ret(std::move(*out)); delete out; - return ret; - })); - }); + return ret; })); }); } -mmdeploy_sender_t mmdeploy_executor_split(mmdeploy_sender_t input) { - if (!input) { - return nullptr; - } - return Guard([&] { return Take(Split(Take(input))); }); +mmdeploy_sender_t mmdeploy_executor_split(mmdeploy_sender_t input) +{ + if (!input) + { + return nullptr; + } + return Guard([&] + { return Take(Split(Take(input))); }); } -mmdeploy_sender_t mmdeploy_executor_when_all(mmdeploy_sender_t inputs[], int32_t n) { - if (!inputs) { - return nullptr; - } - return Guard([&] { +mmdeploy_sender_t mmdeploy_executor_when_all(mmdeploy_sender_t inputs[], int32_t n) +{ + if (!inputs) + { + return nullptr; 
+ } + return Guard([&] + { std::vector senders; senders.reserve(n); for (int i = 0; i < n; ++i) { senders.emplace_back(Take(inputs[i])); } return Take( - Then(WhenAll(std::move(senders)), [](Value::Array&& v) { return Value(std::move(v)); })); - }); + Then(WhenAll(std::move(senders)), [](Value::Array&& v) { return Value(std::move(v)); })); }); } -mmdeploy_sender_t mmdeploy_executor_ensure_started(mmdeploy_sender_t input) { - if (!input) { - return nullptr; - } - return Guard([&] { return Take(EnsureStarted(Take(input))); }); +mmdeploy_sender_t mmdeploy_executor_ensure_started(mmdeploy_sender_t input) +{ + if (!input) + { + return nullptr; + } + return Guard([&] + { return Take(EnsureStarted(Take(input))); }); } -int mmdeploy_executor_start_detached(mmdeploy_sender_t input) { - if (!input) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - StartDetached(Take(input)); - return 0; - } catch (...) { - } - return MMDEPLOY_E_FAIL; +int mmdeploy_executor_start_detached(mmdeploy_sender_t input) +{ + if (!input) + { + return MMDEPLOY_E_INVALID_ARG; + } + try + { + StartDetached(Take(input)); + return 0; + } + catch (...) + { + } + return MMDEPLOY_E_FAIL; } -mmdeploy_value_t mmdeploy_executor_sync_wait(mmdeploy_sender_t input) { - if (!input) { - return nullptr; - } - return Guard([&] { return Take(std::get(SyncWait(Take(input)))); }); +mmdeploy_value_t mmdeploy_executor_sync_wait(mmdeploy_sender_t input) +{ + if (!input) + { + return nullptr; + } + return Guard([&] + { return Take(std::get(SyncWait(Take(input)))); }); } -int mmdeploy_executor_sync_wait_v2(mmdeploy_sender_t sender, mmdeploy_value_t* value) { - if (!sender) { - return MMDEPLOY_E_INVALID_ARG; - } - auto result = mmdeploy_executor_sync_wait(sender); - if (!result) { - return MMDEPLOY_E_FAIL; - } - if (value) { - *value = result; - } else { - mmdeploy_value_destroy(result); - } - return MMDEPLOY_SUCCESS; +int mmdeploy_executor_sync_wait_v2(mmdeploy_sender_t sender, mmdeploy_value_t* value) +{ + if (!sender) + { + return MMDEPLOY_E_INVALID_ARG; + } + auto result = mmdeploy_executor_sync_wait(sender); + if (!result) + { + return MMDEPLOY_E_FAIL; + } + if (value) + { + *value = result; + } + else + { + mmdeploy_value_destroy(result); + } + return MMDEPLOY_SUCCESS; } -void mmdeploy_executor_execute(mmdeploy_scheduler_t scheduler, void (*fn)(void*), void* context) { - Execute(*Cast(scheduler), [fn, context] { fn(context); }); +void mmdeploy_executor_execute(mmdeploy_scheduler_t scheduler, void (*fn)(void*), void* context) +{ + Execute(*Cast(scheduler), [fn, context] + { fn(context); }); } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/executor.h b/csrc/mmdeploy/apis/c/mmdeploy/executor.h index a2c8ffa387..4b044a6b51 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/executor.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/executor.h @@ -6,133 +6,135 @@ #include "mmdeploy/common.h" #if __cplusplus -extern "C" { +extern "C" +{ #endif -/****************************************************************************** - * Experimental asynchronous APIs */ + /****************************************************************************** + * Experimental asynchronous APIs */ -typedef mmdeploy_value_t (*mmdeploy_then_fn_t)(mmdeploy_value_t, void*); + typedef mmdeploy_value_t (*mmdeploy_then_fn_t)(mmdeploy_value_t, void*); -typedef mmdeploy_value_t (*mmdeploy_then_fn_v2_t)(mmdeploy_value_t*, void*); - -typedef int (*mmdeploy_then_fn_v3_t)(mmdeploy_value_t* input, mmdeploy_value_t* output, void*); + typedef mmdeploy_value_t (*mmdeploy_then_fn_v2_t)(mmdeploy_value_t*, void*); + 
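+ /*
+  * A sketch of a then-callback used with \ref mmdeploy_executor_then below (illustrative
+  * only; the callback takes ownership of the value it receives and must return a value,
+  * whose ownership passes back to the pipeline):
+  *
+  *   mmdeploy_value_t on_done(mmdeploy_value_t value, void* context) {
+  *     // inspect or transform `value` here
+  *     return value;
+  *   }
+  *
+  *   output_sender = mmdeploy_executor_then(input_sender, on_done, NULL);
+  */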
+ typedef int (*mmdeploy_then_fn_v3_t)(mmdeploy_value_t* input, mmdeploy_value_t* output, void*);
+
+ struct mmdeploy_sender;
+ struct mmdeploy_scheduler;
+
+ typedef struct mmdeploy_sender* mmdeploy_sender_t;
+ typedef struct mmdeploy_scheduler* mmdeploy_scheduler_t;
-struct mmdeploy_sender;
-struct mmdeploy_scheduler;
+ typedef mmdeploy_sender_t (*mmdeploy_let_value_fn_t)(mmdeploy_value_t, void*);
-typedef struct mmdeploy_sender* mmdeploy_sender_t;
-typedef struct mmdeploy_scheduler* mmdeploy_scheduler_t;
+ ///////////////////////////////////////////////////////////////////////////////
+ // Scheduler
+ ///////////////////////////////////////////////////////////////////////////////
+ MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_inline();
-typedef mmdeploy_sender_t (*mmdeploy_let_value_fn_t)(mmdeploy_value_t, void*);
+ MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_system_pool();
-///////////////////////////////////////////////////////////////////////////////
-// Scheduler
-///////////////////////////////////////////////////////////////////////////////
-MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_inline();
+ /**
+ * Create a thread pool with the given number of worker threads
+ * @param[in] num_threads
+ * @return the handle to the created thread pool
+ */
+ MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_create_thread_pool(int num_threads);
-MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_system_pool();
+ MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_create_thread();
-/**
- * Create a thread pool with the given number of worker threads
- * @param[in] num_threads
- * @return the handle to the created thread pool
- */
-MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_create_thread_pool(int num_threads);
+ MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_dynamic_batch(mmdeploy_scheduler_t scheduler,
+ int max_batch_size,
+ int timeout);
-MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_create_thread();
+ MMDEPLOY_API int mmdeploy_scheduler_destroy(mmdeploy_scheduler_t scheduler);
-MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_dynamic_batch(mmdeploy_scheduler_t scheduler,
- int max_batch_size, int timeout);
+ ///////////////////////////////////////////////////////////////////////////////
+ // Utilities
+ ///////////////////////////////////////////////////////////////////////////////
-MMDEPLOY_API int mmdeploy_scheduler_destroy(mmdeploy_scheduler_t scheduler);
+ /**
+ * @brief Create a copy of a copyable sender. Only senders created by \ref mmdeploy_executor_split
+ * are copyable for now.
+ * @param[in] input copyable sender
+ * @return the sender created, or nullptr if the sender is not copyable
+ */
+ MMDEPLOY_API mmdeploy_sender_t mmdeploy_sender_copy(mmdeploy_sender_t input);
-///////////////////////////////////////////////////////////////////////////////
-// Utilities
-///////////////////////////////////////////////////////////////////////////////
+ /**
+ * @brief Destroy a sender. Note that all sender adapters consume their input senders; only unused
+ * senders should be destroyed using this function.
+ * @param[in] sender
+ */
+ MMDEPLOY_API int mmdeploy_sender_destroy(mmdeploy_sender_t sender);
-/**
- * @brief Create a copy of a copyable sender. Only senders created by \ref mmdeploy_executor_split
- * is copyable for now.
- * @param[in] input copyable sender, - * @return the sender created, or nullptr if the sender is not copyable - */ -MMDEPLOY_API mmdeploy_sender_t mmdeploy_sender_copy(mmdeploy_sender_t input); + /////////////////////////////////////////////////////////////////////////////// + // Sender factories + /////////////////////////////////////////////////////////////////////////////// -/** - * @brief Destroy a sender, notice that all sender adapters will consume input senders, only unused - * senders should be destroyed using this function. - * @param[in] input - */ -MMDEPLOY_API int mmdeploy_sender_destroy(mmdeploy_sender_t sender); + /** + * @brief Create a sender that sends the provided value + * @param[in] value + * @return created sender + */ + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_just(mmdeploy_value_t value); -/////////////////////////////////////////////////////////////////////////////// -// Sender factories -/////////////////////////////////////////////////////////////////////////////// + /** + * @brief + * @param[in] scheduler + * @return the sender created + */ + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_schedule(mmdeploy_scheduler_t scheduler); -/** - * @brief Create a sender that sends the provided value - * @param[in] value - * @return created sender - */ -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_just(mmdeploy_value_t value); + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_transfer_just(mmdeploy_scheduler_t scheduler, + mmdeploy_value_t value); -/** - * @brief - * @param[in] scheduler - * @return the sender created - */ -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_schedule(mmdeploy_scheduler_t scheduler); + /////////////////////////////////////////////////////////////////////////////// + // Sender adapters + /////////////////////////////////////////////////////////////////////////////// -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_transfer_just(mmdeploy_scheduler_t scheduler, - mmdeploy_value_t value); + /** + * Transfer the execution to the execution agent of the provided scheduler + * @param[in] input + * @param[in] scheduler + * @return the sender created + */ + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_transfer(mmdeploy_sender_t input, + mmdeploy_scheduler_t scheduler); -/////////////////////////////////////////////////////////////////////////////// -// Sender adapters -/////////////////////////////////////////////////////////////////////////////// + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_on(mmdeploy_scheduler_t scheduler, + mmdeploy_sender_t input); -/** - * Transfer the execution to the execution agent of the provided scheduler - * @param[in] input - * @param[in] scheduler - * @return the sender created - */ -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_transfer(mmdeploy_sender_t input, - mmdeploy_scheduler_t scheduler); + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_then(mmdeploy_sender_t input, + mmdeploy_then_fn_t fn, + void* context); -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_on(mmdeploy_scheduler_t scheduler, - mmdeploy_sender_t input); + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_let_value(mmdeploy_sender_t input, + mmdeploy_let_value_fn_t fn, + void* context); -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_then(mmdeploy_sender_t input, - mmdeploy_then_fn_t fn, void* context); - -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_let_value(mmdeploy_sender_t input, - mmdeploy_let_value_fn_t fn, - void* context); - -/** - * Convert the input sender into a sender that is copyable via \ref 
mmdeploy_sender_copy. Notice
- * that this function doesn't make the sender multi-shot, it just return a sender that is copyable.
- * @param[in] input
- * @return the sender that is copyable
- */
-MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_split(mmdeploy_sender_t input);
-
-MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_when_all(mmdeploy_sender_t inputs[], int32_t n);
-
-MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_ensure_started(mmdeploy_sender_t input);
-
-///////////////////////////////////////////////////////////////////////////////
-// Sender consumers
-///////////////////////////////////////////////////////////////////////////////
-MMDEPLOY_API int mmdeploy_executor_start_detached(mmdeploy_sender_t input);
-
-MMDEPLOY_API mmdeploy_value_t mmdeploy_executor_sync_wait(mmdeploy_sender_t input);
-
-MMDEPLOY_API int mmdeploy_executor_sync_wait_v2(mmdeploy_sender_t input, mmdeploy_value_t* output);
-
-MMDEPLOY_API void mmdeploy_executor_execute(mmdeploy_scheduler_t scheduler, void (*fn)(void*),
- void* context);
+ /**
+ * Convert the input sender into a sender that is copyable via \ref mmdeploy_sender_copy. Notice
+ * that this function doesn't make the sender multi-shot, it just returns a sender that is copyable.
+ * @param[in] input
+ * @return the sender that is copyable
+ */
+ MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_split(mmdeploy_sender_t input);
+
+ MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_when_all(mmdeploy_sender_t inputs[], int32_t n);
+
+ MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_ensure_started(mmdeploy_sender_t input);
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Sender consumers
+ ///////////////////////////////////////////////////////////////////////////////
+ MMDEPLOY_API int mmdeploy_executor_start_detached(mmdeploy_sender_t input);
+
+ MMDEPLOY_API mmdeploy_value_t mmdeploy_executor_sync_wait(mmdeploy_sender_t input);
+
+ MMDEPLOY_API int mmdeploy_executor_sync_wait_v2(mmdeploy_sender_t input, mmdeploy_value_t* output);
+
+ MMDEPLOY_API void mmdeploy_executor_execute(mmdeploy_scheduler_t scheduler, void (*fn)(void*), void* context);

#if __cplusplus
}

diff --git a/csrc/mmdeploy/apis/c/mmdeploy/executor_internal.h b/csrc/mmdeploy/apis/c/mmdeploy/executor_internal.h
index 95f39fe009..0ae8c2a529 100644
--- a/csrc/mmdeploy/apis/c/mmdeploy/executor_internal.h
+++ b/csrc/mmdeploy/apis/c/mmdeploy/executor_internal.h
@@ -8,33 +8,49 @@
 using namespace mmdeploy;

-using SenderType = TypeErasedSender<Value>;
+using SenderType = TypeErasedSender<Value>;
 using SchedulerType = TypeErasedScheduler<Value>;

-namespace {
-
-inline SchedulerType* Cast(mmdeploy_scheduler_t s) { return reinterpret_cast<SchedulerType*>(s); }
-
-inline mmdeploy_scheduler_t Cast(SchedulerType* s) {
- return reinterpret_cast<mmdeploy_scheduler_t>(s);
-}
-
-inline SenderType* Cast(mmdeploy_sender_t s) { return reinterpret_cast<SenderType*>(s); }
-
-inline mmdeploy_sender_t Cast(SenderType* s) { return reinterpret_cast<mmdeploy_sender_t>(s); }
-
-inline SenderType Take(mmdeploy_sender_t s) {
- auto sender = std::move(*Cast(s));
- mmdeploy_sender_destroy(s);
- return sender;
-}
-
-inline mmdeploy_sender_t Take(SenderType s) { return Cast(new SenderType(std::move(s))); }
-
-template , int> = 0>
-inline mmdeploy_sender_t Take(T& s) {
- return Take(SenderType(std::move(s)));
-}
+namespace
+{
+
+ inline SchedulerType* Cast(mmdeploy_scheduler_t s)
+ {
+ return reinterpret_cast<SchedulerType*>(s);
+ }
+
+ inline mmdeploy_scheduler_t Cast(SchedulerType* s)
+ {
+ return reinterpret_cast<mmdeploy_scheduler_t>(s);
+ }
+
+ inline SenderType* Cast(mmdeploy_sender_t s)
+ {
+ return
reinterpret_cast(s); + } + + inline mmdeploy_sender_t Cast(SenderType* s) + { + return reinterpret_cast(s); + } + + inline SenderType Take(mmdeploy_sender_t s) + { + auto sender = std::move(*Cast(s)); + mmdeploy_sender_destroy(s); + return sender; + } + + inline mmdeploy_sender_t Take(SenderType s) + { + return Cast(new SenderType(std::move(s))); + } + + template, int> = 0> + inline mmdeploy_sender_t Take(T& s) + { + return Take(SenderType(std::move(s))); + } } // namespace diff --git a/csrc/mmdeploy/apis/c/mmdeploy/handle.h b/csrc/mmdeploy/apis/c/mmdeploy/handle.h index 006ddaae3d..d2ccde1ef5 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/handle.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/handle.h @@ -11,42 +11,53 @@ #include "mmdeploy/graph/common.h" #include "mmdeploy/graph/static_router.h" -namespace mmdeploy { - -using namespace framework; - -namespace { - -class AsyncHandle { - public: - AsyncHandle(const char* device_name, int device_id, Value config) - : AsyncHandle(SetContext(std::move(config), device_name, device_id)) {} - - explicit AsyncHandle(const Value& config) { - if (auto builder = graph::Builder::CreateFromConfig(config).value()) { - node_ = builder->Build().value(); - } else { - MMDEPLOY_ERROR("failed to find creator for node"); - throw_exception(eEntryNotFound); - } - } - - graph::Sender Process(graph::Sender input) { - return node_->Process(std::move(input)); - } - - private: - static Value SetContext(Value config, const char* device_name, int device_id) { - Device device(device_name, device_id); - Stream stream(device); - config["context"].update({{"device", device}, {"stream", stream}}); - return config; - } - - std::unique_ptr node_; -}; - -} // namespace +namespace mmdeploy +{ + + using namespace framework; + + namespace + { + + class AsyncHandle + { + public: + AsyncHandle(const char* device_name, int device_id, Value config) + : AsyncHandle(SetContext(std::move(config), device_name, device_id)) + { + } + + explicit AsyncHandle(const Value& config) + { + if (auto builder = graph::Builder::CreateFromConfig(config).value()) + { + node_ = builder->Build().value(); + } + else + { + MMDEPLOY_ERROR("failed to find creator for node"); + throw_exception(eEntryNotFound); + } + } + + graph::Sender Process(graph::Sender input) + { + return node_->Process(std::move(input)); + } + + private: + static Value SetContext(Value config, const char* device_name, int device_id) + { + Device device(device_name, device_id); + Stream stream(device); + config["context"].update({{"device", device}, {"stream", stream}}); + return config; + } + + std::unique_ptr node_; + }; + + } // namespace } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/c/mmdeploy/model.cpp b/csrc/mmdeploy/apis/c/mmdeploy/model.cpp index 6d202bce81..08af517522 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/model.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/model.cpp @@ -12,30 +12,45 @@ using namespace mmdeploy; -int mmdeploy_model_create_by_path(const char* path, mmdeploy_model_t* model) { - try { - auto ptr = std::make_unique(path); - *model = reinterpret_cast(ptr.release()); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("failed to create model: {}", e.what()); - } catch (...) 
{ - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; +int mmdeploy_model_create_by_path(const char* path, mmdeploy_model_t* model) +{ + try + { + auto ptr = std::make_unique(path); + *model = reinterpret_cast(ptr.release()); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("failed to create model: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -int mmdeploy_model_create(const void* buffer, int size, mmdeploy_model_t* model) { - try { - auto ptr = std::make_unique(buffer, size); - *model = reinterpret_cast(ptr.release()); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("failed to create model: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; +int mmdeploy_model_create(const void* buffer, int size, mmdeploy_model_t* model) +{ + try + { + auto ptr = std::make_unique(buffer, size); + *model = reinterpret_cast(ptr.release()); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("failed to create model: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -void mmdeploy_model_destroy(mmdeploy_model_t model) { delete reinterpret_cast(model); } +void mmdeploy_model_destroy(mmdeploy_model_t model) +{ + delete reinterpret_cast(model); +} diff --git a/csrc/mmdeploy/apis/c/mmdeploy/model.h b/csrc/mmdeploy/apis/c/mmdeploy/model.h index 394d2902c2..ddea967f1a 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/model.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/model.h @@ -11,34 +11,35 @@ #include "mmdeploy/common.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_model* mmdeploy_model_t; - -/** - * @brief Create SDK Model instance from given model path - * @param[in] path model path - * @param[out] model sdk model instance that must be destroyed by \ref mmdeploy_model_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_model_create_by_path(const char* path, mmdeploy_model_t* model); - -/** - * @brief Create SDK Model instance from memory - * @param[in] buffer a linear buffer contains the model information - * @param[in] size size of \p buffer in bytes - * @param[out] model sdk model instance that must be destroyed by \ref mmdeploy_model_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_model_create(const void* buffer, int size, mmdeploy_model_t* model); - -/** - * @brief Destroy model instance - * @param[in] model sdk model instance created by \ref mmdeploy_model_create_by_path or \ref - * mmdeploy_model_create - */ -MMDEPLOY_API void mmdeploy_model_destroy(mmdeploy_model_t model); + typedef struct mmdeploy_model* mmdeploy_model_t; + + /** + * @brief Create SDK Model instance from given model path + * @param[in] path model path + * @param[out] model sdk model instance that must be destroyed by \ref mmdeploy_model_destroy + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_model_create_by_path(const char* path, mmdeploy_model_t* model); + + /** + * @brief Create SDK Model instance from memory + * @param[in] buffer a linear buffer contains the model information + * @param[in] size size of \p buffer in bytes + * @param[out] model sdk model instance that must be destroyed by \ref mmdeploy_model_destroy + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_model_create(const 
void* buffer, int size, mmdeploy_model_t* model); + + /** + * @brief Destroy model instance + * @param[in] model sdk model instance created by \ref mmdeploy_model_create_by_path or \ref + * mmdeploy_model_create + */ + MMDEPLOY_API void mmdeploy_model_destroy(mmdeploy_model_t model); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pipeline.cpp b/csrc/mmdeploy/apis/c/mmdeploy/pipeline.cpp index a9a02807ee..b0d3d6a220 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/pipeline.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/pipeline.cpp @@ -6,73 +6,90 @@ #include "mmdeploy/executor_internal.h" #include "mmdeploy/handle.h" -int mmdeploy_pipeline_create_v3(mmdeploy_value_t config, mmdeploy_context_t context, - mmdeploy_pipeline_t* pipeline) { - try { - auto _config = *Cast(config); - if (context) { - if (!_config.contains("context")) { - _config["context"] = Value::Object(); - } - update(_config["context"].object(), Cast(context)->object(), 2); +int mmdeploy_pipeline_create_v3(mmdeploy_value_t config, mmdeploy_context_t context, mmdeploy_pipeline_t* pipeline) +{ + try + { + auto _config = *Cast(config); + if (context) + { + if (!_config.contains("context")) + { + _config["context"] = Value::Object(); + } + update(_config["context"].object(), Cast(context)->object(), 2); + } + auto _handle = std::make_unique(std::move(_config)); + *pipeline = Cast(_handle.release()); + return MMDEPLOY_SUCCESS; } - auto _handle = std::make_unique(std::move(_config)); - *pipeline = Cast(_handle.release()); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("exception caught: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + catch (const std::exception& e) + { + MMDEPLOY_ERROR("exception caught: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -int mmdeploy_pipeline_create_from_model(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_pipeline_t* pipeline) { - auto config = Cast(model)->ReadConfig("pipeline.json"); - auto _context = *Cast(context); - _context["model"] = *Cast(model); - return mmdeploy_pipeline_create_v3(Cast(&config.value()), (mmdeploy_context_t)&_context, - pipeline); +int mmdeploy_pipeline_create_from_model(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_pipeline_t* pipeline) +{ + auto config = Cast(model)->ReadConfig("pipeline.json"); + auto _context = *Cast(context); + _context["model"] = *Cast(model); + return mmdeploy_pipeline_create_v3(Cast(&config.value()), (mmdeploy_context_t)&_context, pipeline); } -int mmdeploy_pipeline_apply_async(mmdeploy_pipeline_t pipeline, mmdeploy_sender_t input, - mmdeploy_sender_t* output) { - if (!pipeline || !input || !output) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - auto h = Cast(pipeline); - *output = Take(h->Process(Take(input))); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("exception caught: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; +int mmdeploy_pipeline_apply_async(mmdeploy_pipeline_t pipeline, mmdeploy_sender_t input, mmdeploy_sender_t* output) +{ + if (!pipeline || !input || !output) + { + return MMDEPLOY_E_INVALID_ARG; + } + try + { + auto h = Cast(pipeline); + *output = Take(h->Process(Take(input))); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("exception caught: {}", e.what()); + } + catch (...) 
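The model and pipeline entry points above compose in a fixed order: load (or wrap) a model, build a pipeline from it, then release the model handle. A minimal lifecycle sketch, not part of this patch; "model_dir" is a placeholder path, the context is assumed to be created elsewhere, and error handling is reduced to early returns:

#include "mmdeploy/model.h"
#include "mmdeploy/pipeline.h"

/* Build a pipeline from an on-disk SDK model. */
static int build_pipeline(mmdeploy_context_t ctx, mmdeploy_pipeline_t* pipeline)
{
    mmdeploy_model_t model = NULL;
    int ec = mmdeploy_model_create_by_path("model_dir", &model);
    if (ec != MMDEPLOY_SUCCESS)
    {
        return ec;
    }
    /* reads the pipeline.json packaged inside the model, as in
       mmdeploy_pipeline_create_from_model above */
    ec = mmdeploy_pipeline_create_from_model(model, ctx, pipeline);
    mmdeploy_model_destroy(model); /* the pipeline keeps the model via its context,
                                      mirroring the create-then-destroy pattern used
                                      by pose_detector.cpp below */
    return ec;
}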
+ { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -void mmdeploy_pipeline_destroy(mmdeploy_pipeline_t pipeline) { - if (pipeline != nullptr) { - delete Cast(pipeline); - } +void mmdeploy_pipeline_destroy(mmdeploy_pipeline_t pipeline) +{ + if (pipeline != nullptr) + { + delete Cast(pipeline); + } } -int mmdeploy_pipeline_apply(mmdeploy_pipeline_t pipeline, mmdeploy_value_t input, - mmdeploy_value_t* output) { - auto input_sender = mmdeploy_executor_just(input); - if (!input_sender) { - return MMDEPLOY_E_FAIL; - } - mmdeploy_sender_t output_sender{}; - if (auto ec = mmdeploy_pipeline_apply_async(pipeline, input_sender, &output_sender)) { - return ec; - } - auto _output = mmdeploy_executor_sync_wait(output_sender); - if (!_output) { - return MMDEPLOY_E_FAIL; - } - *output = _output; - return MMDEPLOY_SUCCESS; +int mmdeploy_pipeline_apply(mmdeploy_pipeline_t pipeline, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + auto input_sender = mmdeploy_executor_just(input); + if (!input_sender) + { + return MMDEPLOY_E_FAIL; + } + mmdeploy_sender_t output_sender{}; + if (auto ec = mmdeploy_pipeline_apply_async(pipeline, input_sender, &output_sender)) + { + return ec; + } + auto _output = mmdeploy_executor_sync_wait(output_sender); + if (!_output) + { + return MMDEPLOY_E_FAIL; + } + *output = _output; + return MMDEPLOY_SUCCESS; } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pipeline.h b/csrc/mmdeploy/apis/c/mmdeploy/pipeline.h index 55ccf1e67c..faf523863f 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/pipeline.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/pipeline.h @@ -8,59 +8,59 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -/****************************************************************************** - * Experimental pipeline APIs */ + /****************************************************************************** + * Experimental pipeline APIs */ -typedef struct mmdeploy_pipeline* mmdeploy_pipeline_t; + typedef struct mmdeploy_pipeline* mmdeploy_pipeline_t; -/** - * Create pipeline - * @param config - * @param context - * @param pipeline - * @return - */ -MMDEPLOY_API int mmdeploy_pipeline_create_v3(mmdeploy_value_t config, mmdeploy_context_t context, - mmdeploy_pipeline_t* pipeline); -/** - * Create pipeline from internal pipeline config of the model - * @param model - * @param context - * @param pipeline - * @return - */ -MMDEPLOY_API int mmdeploy_pipeline_create_from_model(mmdeploy_model_t model, - mmdeploy_context_t context, - mmdeploy_pipeline_t* pipeline); + /** + * Create pipeline + * @param config + * @param context + * @param pipeline + * @return + */ + MMDEPLOY_API int mmdeploy_pipeline_create_v3(mmdeploy_value_t config, mmdeploy_context_t context, mmdeploy_pipeline_t* pipeline); + /** + * Create pipeline from internal pipeline config of the model + * @param model + * @param context + * @param pipeline + * @return + */ + MMDEPLOY_API int mmdeploy_pipeline_create_from_model(mmdeploy_model_t model, + mmdeploy_context_t context, + mmdeploy_pipeline_t* pipeline); -/** - * @brief Apply pipeline - * @param[in] pipeline handle of the pipeline - * @param[in] input input value - * @param[out] output output value - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_pipeline_apply(mmdeploy_pipeline_t pipeline, mmdeploy_value_t input, - mmdeploy_value_t* output); + /** + * @brief Apply pipeline + * @param[in] pipeline handle of the pipeline + * @param[in] input input value + * @param[out] output output value + * @return 
status of the operation + */ + MMDEPLOY_API int mmdeploy_pipeline_apply(mmdeploy_pipeline_t pipeline, mmdeploy_value_t input, mmdeploy_value_t* output); -/** - * Apply pipeline asynchronously - * @param pipeline handle of the pipeline - * @param input input sender that will be consumed by the operation - * @param output output sender - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_pipeline_apply_async(mmdeploy_pipeline_t pipeline, - mmdeploy_sender_t input, mmdeploy_sender_t* output); + /** + * Apply pipeline asynchronously + * @param pipeline handle of the pipeline + * @param input input sender that will be consumed by the operation + * @param output output sender + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_pipeline_apply_async(mmdeploy_pipeline_t pipeline, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); -/** - * @brief destroy pipeline - * @param[in] pipeline - */ -MMDEPLOY_API void mmdeploy_pipeline_destroy(mmdeploy_pipeline_t pipeline); + /** + * @brief destroy pipeline + * @param[in] pipeline + */ + MMDEPLOY_API void mmdeploy_pipeline_destroy(mmdeploy_pipeline_t pipeline); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.cpp b/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.cpp index 46f9921e62..ee0cc0c564 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.cpp @@ -16,164 +16,197 @@ using namespace std; using namespace mmdeploy; -int mmdeploy_pose_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_pose_detector_t* detector) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_pose_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_pose_detector_t* detector) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_pose_detector_create_v2(model, context, detector); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_pose_detector_create_v2(model, context, detector); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_pose_detector_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_pose_detector_t* detector) { - mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { +int mmdeploy_pose_detector_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_pose_detector_t* detector) +{ + mmdeploy_model_t model{}; + if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = mmdeploy_pose_detector_create(model, device_name, device_id, detector); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_pose_detector_create(model, device_name, device_id, detector); - mmdeploy_model_destroy(model); - return ec; } -int mmdeploy_pose_detector_apply(mmdeploy_pose_detector_t detector, const mmdeploy_mat_t* mats, - int mat_count, mmdeploy_pose_detection_t** results) { - return mmdeploy_pose_detector_apply_bbox(detector, mats, mat_count, nullptr, nullptr, results); +int mmdeploy_pose_detector_apply(mmdeploy_pose_detector_t detector, const mmdeploy_mat_t* mats, int mat_count, mmdeploy_pose_detection_t** results) +{ + return mmdeploy_pose_detector_apply_bbox(detector, mats, 
mat_count, nullptr, nullptr, results); } -int mmdeploy_pose_detector_apply_bbox(mmdeploy_pose_detector_t detector, const mmdeploy_mat_t* mats, - int mat_count, const mmdeploy_rect_t* bboxes, - const int* bbox_count, mmdeploy_pose_detection_t** results) { - wrapped input; - if (auto ec = - mmdeploy_pose_detector_create_input(mats, mat_count, bboxes, bbox_count, input.ptr())) { - return ec; - } - wrapped output; - if (auto ec = mmdeploy_pose_detector_apply_v2(detector, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_pose_detector_get_result(output, results)) { - return ec; - } - return MMDEPLOY_SUCCESS; +int mmdeploy_pose_detector_apply_bbox(mmdeploy_pose_detector_t detector, const mmdeploy_mat_t* mats, int mat_count, const mmdeploy_rect_t* bboxes, const int* bbox_count, mmdeploy_pose_detection_t** results) +{ + wrapped input; + if (auto ec = + mmdeploy_pose_detector_create_input(mats, mat_count, bboxes, bbox_count, input.ptr())) + { + return ec; + } + wrapped output; + if (auto ec = mmdeploy_pose_detector_apply_v2(detector, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_pose_detector_get_result(output, results)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } -void mmdeploy_pose_detector_release_result(mmdeploy_pose_detection_t* results, int count) { - if (results == nullptr) { - return; - } - for (int i = 0; i < count; ++i) { - delete[] results[i].point; - delete[] results[i].score; - } - delete[] results; +void mmdeploy_pose_detector_release_result(mmdeploy_pose_detection_t* results, int count) +{ + if (results == nullptr) + { + return; + } + for (int i = 0; i < count; ++i) + { + delete[] results[i].point; + delete[] results[i].score; + } + delete[] results; } -void mmdeploy_pose_detector_destroy(mmdeploy_pose_detector_t detector) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector); +void mmdeploy_pose_detector_destroy(mmdeploy_pose_detector_t detector) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector); } -int mmdeploy_pose_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_pose_detector_t* detector) { - return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector); +int mmdeploy_pose_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_pose_detector_t* detector) +{ + return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector); } -int mmdeploy_pose_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, - const mmdeploy_rect_t* bboxes, const int* bbox_count, - mmdeploy_value_t* value) { - if (mat_count && mats == nullptr) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - Value::Array input_images; - - auto add_bbox = [&](const Mat& img, const mmdeploy_rect_t* bbox) { - Value::Array b; - if (bbox) { - float width = bbox->right - bbox->left + 1; - float height = bbox->bottom - bbox->top + 1; - b = {bbox->left, bbox->top, width, height, 1.0}; - } else { - b = {0, 0, img.width(), img.height(), 1.0}; - } - input_images.push_back({{"ori_img", img}, {"bbox", std::move(b)}}); - }; - - for (int i = 0; i < mat_count; ++i) { - auto _mat = Cast(mats[i]); - if (bboxes && bbox_count) { - for (int j = 0; j < bbox_count[i]; ++j) { - add_bbox(_mat, bboxes++); - } - } else { // inference whole image - add_bbox(_mat, nullptr); - } +int mmdeploy_pose_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, const mmdeploy_rect_t* bboxes, const int* bbox_count, mmdeploy_value_t* value) +{ + if (mat_count && mats == 
nullptr) + { + return MMDEPLOY_E_INVALID_ARG; } + try + { + Value::Array input_images; + + auto add_bbox = [&](const Mat& img, const mmdeploy_rect_t* bbox) + { + Value::Array b; + if (bbox) + { + float width = bbox->right - bbox->left + 1; + float height = bbox->bottom - bbox->top + 1; + b = {bbox->left, bbox->top, width, height, 1.0}; + } + else + { + b = {0, 0, img.width(), img.height(), 1.0}; + } + input_images.push_back({{"ori_img", img}, {"bbox", std::move(b)}}); + }; + + for (int i = 0; i < mat_count; ++i) + { + auto _mat = Cast(mats[i]); + if (bboxes && bbox_count) + { + for (int j = 0; j < bbox_count[i]; ++j) + { + add_bbox(_mat, bboxes++); + } + } + else + { // inference whole image + add_bbox(_mat, nullptr); + } + } - *value = Take(Value{std::move(input_images)}); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + *value = Take(Value{std::move(input_images)}); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -int mmdeploy_pose_detector_apply_v2(mmdeploy_pose_detector_t detector, mmdeploy_value_t input, - mmdeploy_value_t* output) { - return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output); +int mmdeploy_pose_detector_apply_v2(mmdeploy_pose_detector_t detector, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output); } -int mmdeploy_pose_detector_apply_async(mmdeploy_pose_detector_t detector, mmdeploy_sender_t input, - mmdeploy_sender_t* output) { - return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output); +int mmdeploy_pose_detector_apply_async(mmdeploy_pose_detector_t detector, mmdeploy_sender_t input, mmdeploy_sender_t* output) +{ + return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output); } -int mmdeploy_pose_detector_get_result(mmdeploy_value_t output, - mmdeploy_pose_detection_t** results) { - if (!output || !results) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - std::vector detections; - from_value(Cast(output)->front(), detections); - - size_t count = detections.size(); - - auto deleter = [&](mmdeploy_pose_detection_t* p) { - mmdeploy_pose_detector_release_result(p, static_cast(count)); - }; - - std::unique_ptr _results( - new mmdeploy_pose_detection_t[count]{}, deleter); - - size_t result_idx = 0; - for (const auto& bbox_result : detections) { - auto& res = _results[result_idx++]; - auto size = bbox_result.key_points.size(); - - res.point = new mmdeploy_point_t[size]; - res.score = new float[size]; - res.length = static_cast(size); - - for (int k = 0; k < size; k++) { - res.point[k].x = bbox_result.key_points[k].bbox[0]; - res.point[k].y = bbox_result.key_points[k].bbox[1]; - res.score[k] = bbox_result.key_points[k].score; - } +int mmdeploy_pose_detector_get_result(mmdeploy_value_t output, + mmdeploy_pose_detection_t** results) +{ + if (!output || !results) + { + return MMDEPLOY_E_INVALID_ARG; } + try + { + std::vector detections; + from_value(Cast(output)->front(), detections); + + size_t count = detections.size(); + + auto deleter = [&](mmdeploy_pose_detection_t* p) + { + mmdeploy_pose_detector_release_result(p, static_cast(count)); + }; + + std::unique_ptr _results( + new 
mmdeploy_pose_detection_t[count]{}, + deleter); + + size_t result_idx = 0; + for (const auto& bbox_result : detections) + { + auto& res = _results[result_idx++]; + auto size = bbox_result.key_points.size(); + + res.point = new mmdeploy_point_t[size]; + res.score = new float[size]; + res.length = static_cast(size); + + for (int k = 0; k < size; k++) + { + res.point[k].x = bbox_result.key_points[k].bbox[0]; + res.point[k].y = bbox_result.key_points[k].bbox[1]; + res.score[k] = bbox_result.key_points[k].score; + } + } - *results = _results.release(); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + *results = _results.release(); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.h b/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.h index ff0987cee4..6fceb99f72 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.h @@ -13,111 +13,113 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_pose_detection_t { - mmdeploy_point_t* point; ///< keypoint - float* score; ///< keypoint score - int length; ///< number of keypoint -} mmdeploy_pose_detection_t; - -typedef struct mmdeploy_pose_detector* mmdeploy_pose_detector_t; - -/** - * @brief Create a pose detector instance - * @param[in] model an instance of mmpose model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] detector handle of the created pose detector, which must be destroyed - * by \ref mmdeploy_pose_detector_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_pose_detector_create(mmdeploy_model_t model, const char* device_name, - int device_id, mmdeploy_pose_detector_t* detector); - -/** - * @brief Create a pose detector instance - * @param[in] model_path path to pose detection model - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. 
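The functions just shown wire create → apply → get_result → release, and with a null bbox list `apply` yields exactly one `mmdeploy_pose_detection_t` per input image. A hedged end-to-end sketch, not part of the patch; the pixel buffer and its dimensions are caller-supplied placeholders, and the `MMDEPLOY_PIXEL_FORMAT_BGR` / `MMDEPLOY_DATA_TYPE_UINT8` constants are assumed to come from common.h:

#include <stdint.h>
#include <stdio.h>
#include "mmdeploy/pose_detector.h"

static int detect_one_frame(uint8_t* pixels, int width, int height)
{
    mmdeploy_pose_detector_t detector = NULL;
    if (mmdeploy_pose_detector_create_by_path("pose_model_dir", "cpu", 0, &detector) != MMDEPLOY_SUCCESS)
    {
        return -1;
    }
    mmdeploy_mat_t img = {0}; /* field names follow restorer.cpp below */
    img.data    = pixels;     /* H x W x 3, BGR byte order */
    img.height  = height;
    img.width   = width;
    img.channel = 3;
    img.format  = MMDEPLOY_PIXEL_FORMAT_BGR;
    img.type    = MMDEPLOY_DATA_TYPE_UINT8;

    mmdeploy_pose_detection_t* poses = NULL;
    if (mmdeploy_pose_detector_apply(detector, &img, 1, &poses) == MMDEPLOY_SUCCESS)
    {
        for (int k = 0; k < poses[0].length; ++k) /* one result per image: whole-image roi */
        {
            printf("kpt %d: (%.1f, %.1f) score %.2f\n", k, poses[0].point[k].x, poses[0].point[k].y, poses[0].score[k]);
        }
        mmdeploy_pose_detector_release_result(poses, 1);
    }
    mmdeploy_pose_detector_destroy(detector);
    return 0;
}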
- * @param[out] detector handle of the created pose detector, which must be destroyed - * by \ref mmdeploy_pose_detector_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_pose_detector_create_by_path(const char* model_path, - const char* device_name, int device_id, - mmdeploy_pose_detector_t* detector); - -/** - * @brief Apply pose detector to a batch of images with full image roi - * @param[in] detector pose detector's handle created by \ref - * mmdeploy_pose_detector_create_by_path - * @param[in] images a batch of images - * @param[in] count number of images in the batch - * @param[out] results a linear buffer contains the pose result, must be release - * by \ref mmdeploy_pose_detector_release_result - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_pose_detector_apply(mmdeploy_pose_detector_t detector, - const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_pose_detection_t** results); - -/** - * @brief Apply pose detector to a batch of images supplied with bboxes(roi) - * @param[in] detector pose detector's handle created by \ref - * mmdeploy_pose_detector_create_by_path - * @param[in] images a batch of images - * @param[in] image_count number of images in the batch - * @param[in] bboxes bounding boxes(roi) detected by mmdet - * @param[in] bbox_count number of bboxes of each \p images, must be same length as \p images - * @param[out] results a linear buffer contains the pose result, which has the same length as \p - * bboxes, must be release by \ref mmdeploy_pose_detector_release_result - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_pose_detector_apply_bbox(mmdeploy_pose_detector_t detector, - const mmdeploy_mat_t* mats, int mat_count, - const mmdeploy_rect_t* bboxes, - const int* bbox_count, - mmdeploy_pose_detection_t** results); - -/** @brief Release result buffer returned by \ref mmdeploy_pose_detector_apply or \ref - * mmdeploy_pose_detector_apply_bbox - * @param[in] results result buffer by pose detector - * @param[in] count length of \p result - */ -MMDEPLOY_API void mmdeploy_pose_detector_release_result(mmdeploy_pose_detection_t* results, - int count); - -/** - * @brief destroy pose_detector - * @param[in] detector handle of pose_detector created by \ref - * mmdeploy_pose_detector_create_by_path or \ref mmdeploy_pose_detector_create - */ -MMDEPLOY_API void mmdeploy_pose_detector_destroy(mmdeploy_pose_detector_t detector); - -/****************************************************************************** - * Experimental asynchronous APIs */ - -MMDEPLOY_API int mmdeploy_pose_detector_create_v2(mmdeploy_model_t model, - mmdeploy_context_t context, - mmdeploy_pose_detector_t* detector); - -MMDEPLOY_API int mmdeploy_pose_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, - const mmdeploy_rect_t* bboxes, - const int* bbox_count, - mmdeploy_value_t* value); - -MMDEPLOY_API int mmdeploy_pose_detector_apply_v2(mmdeploy_pose_detector_t detector, - mmdeploy_value_t input, mmdeploy_value_t* output); - -MMDEPLOY_API int mmdeploy_pose_detector_apply_async(mmdeploy_pose_detector_t detector, - mmdeploy_sender_t input, - mmdeploy_sender_t* output); - -MMDEPLOY_API int mmdeploy_pose_detector_get_result(mmdeploy_value_t output, - mmdeploy_pose_detection_t** results); + typedef struct mmdeploy_pose_detection_t + { + mmdeploy_point_t* point; ///< keypoint + float* score; ///< keypoint score + int length; ///< number of keypoint + } mmdeploy_pose_detection_t; + + typedef struct mmdeploy_pose_detector* 
mmdeploy_pose_detector_t;
+
+    /**
+     * @brief Create a pose detector instance
+     * @param[in] model an instance of mmpose model created by
+     * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h
+     * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+     * @param[in] device_id id of device.
+     * @param[out] detector handle of the created pose detector, which must be destroyed
+     * by \ref mmdeploy_pose_detector_destroy
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_pose_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_pose_detector_t* detector);
+
+    /**
+     * @brief Create a pose detector instance
+     * @param[in] model_path path to pose detection model
+     * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+     * @param[in] device_id id of device.
+     * @param[out] detector handle of the created pose detector, which must be destroyed
+     * by \ref mmdeploy_pose_detector_destroy
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_pose_detector_create_by_path(const char* model_path,
+                                                           const char* device_name,
+                                                           int device_id,
+                                                           mmdeploy_pose_detector_t* detector);
+
+    /**
+     * @brief Apply pose detector to a batch of images with full image roi
+     * @param[in] detector pose detector's handle created by \ref
+     * mmdeploy_pose_detector_create_by_path
+     * @param[in] images a batch of images
+     * @param[in] count number of images in the batch
+     * @param[out] results a linear buffer containing the pose results, must be released
+     * by \ref mmdeploy_pose_detector_release_result
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_pose_detector_apply(mmdeploy_pose_detector_t detector,
+                                                  const mmdeploy_mat_t* mats,
+                                                  int mat_count,
+                                                  mmdeploy_pose_detection_t** results);
+
+    /**
+     * @brief Apply pose detector to a batch of images supplied with bboxes(roi)
+     * @param[in] detector pose detector's handle created by \ref
+     * mmdeploy_pose_detector_create_by_path
+     * @param[in] images a batch of images
+     * @param[in] image_count number of images in the batch
+     * @param[in] bboxes bounding boxes(roi) detected by mmdet
+     * @param[in] bbox_count number of bboxes of each \p images, must be same length as \p images
+     * @param[out] results a linear buffer containing the pose results, which has the same length as \p
+     * bboxes, must be released by \ref mmdeploy_pose_detector_release_result
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_pose_detector_apply_bbox(mmdeploy_pose_detector_t detector,
+                                                       const mmdeploy_mat_t* mats,
+                                                       int mat_count,
+                                                       const mmdeploy_rect_t* bboxes,
+                                                       const int* bbox_count,
+                                                       mmdeploy_pose_detection_t** results);
+
+    /** @brief Release result buffer returned by \ref mmdeploy_pose_detector_apply or \ref
+     * mmdeploy_pose_detector_apply_bbox
+     * @param[in] results result buffer by pose detector
+     * @param[in] count length of \p results
+     */
+    MMDEPLOY_API void mmdeploy_pose_detector_release_result(mmdeploy_pose_detection_t* results,
+                                                            int count);
+
+    /**
+     * @brief destroy pose_detector
+     * @param[in] detector handle of pose_detector created by \ref
+     * mmdeploy_pose_detector_create_by_path or \ref mmdeploy_pose_detector_create
+     */
+    MMDEPLOY_API void mmdeploy_pose_detector_destroy(mmdeploy_pose_detector_t detector);
+
+    /******************************************************************************
+     * Experimental asynchronous APIs */
+
+    MMDEPLOY_API int mmdeploy_pose_detector_create_v2(mmdeploy_model_t model,
+    
mmdeploy_context_t context, + mmdeploy_pose_detector_t* detector); + + MMDEPLOY_API int mmdeploy_pose_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, const mmdeploy_rect_t* bboxes, const int* bbox_count, mmdeploy_value_t* value); + + MMDEPLOY_API int mmdeploy_pose_detector_apply_v2(mmdeploy_pose_detector_t detector, + mmdeploy_value_t input, + mmdeploy_value_t* output); + + MMDEPLOY_API int mmdeploy_pose_detector_apply_async(mmdeploy_pose_detector_t detector, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); + + MMDEPLOY_API int mmdeploy_pose_detector_get_result(mmdeploy_value_t output, + mmdeploy_pose_detection_t** results); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.cpp b/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.cpp index 113b520c39..d2587b1949 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.cpp @@ -9,18 +9,21 @@ #include "mmdeploy/core/mpl/structure.h" #include "mmdeploy/pipeline.h" -namespace mmdeploy { +namespace mmdeploy +{ -using namespace framework; + using namespace framework; } // namespace mmdeploy using namespace mmdeploy; -namespace { +namespace +{ -Value config_template() { - static const auto json = R"( + Value config_template() + { + static const auto json = R"( { "type": "Pipeline", "input": ["img", "force_det", "state"], @@ -77,149 +80,184 @@ Value config_template() { ] } )"_json; - static const auto config = from_json(json); - return config; -} + static const auto config = from_json(json); + return config; + } } // namespace -int mmdeploy_pose_tracker_default_params(mmdeploy_pose_tracker_param_t* params) { - mmpose::_pose_tracker::SetDefaultParams(*params); - return 0; +int mmdeploy_pose_tracker_default_params(mmdeploy_pose_tracker_param_t* params) +{ + mmpose::_pose_tracker::SetDefaultParams(*params); + return 0; } -int mmdeploy_pose_tracker_create(mmdeploy_model_t det_model, mmdeploy_model_t pose_model, - mmdeploy_context_t context, mmdeploy_pose_tracker_t* pipeline) { - mmdeploy_context_add(context, MMDEPLOY_TYPE_MODEL, "detection", det_model); - mmdeploy_context_add(context, MMDEPLOY_TYPE_MODEL, "pose", pose_model); - auto config = config_template(); - return mmdeploy_pipeline_create_v3(Cast(&config), context, (mmdeploy_pipeline_t*)pipeline); +int mmdeploy_pose_tracker_create(mmdeploy_model_t det_model, mmdeploy_model_t pose_model, mmdeploy_context_t context, mmdeploy_pose_tracker_t* pipeline) +{ + mmdeploy_context_add(context, MMDEPLOY_TYPE_MODEL, "detection", det_model); + mmdeploy_context_add(context, MMDEPLOY_TYPE_MODEL, "pose", pose_model); + auto config = config_template(); + return mmdeploy_pipeline_create_v3(Cast(&config), context, (mmdeploy_pipeline_t*)pipeline); } -void mmdeploy_pose_tracker_destroy(mmdeploy_pose_tracker_t pipeline) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)pipeline); +void mmdeploy_pose_tracker_destroy(mmdeploy_pose_tracker_t pipeline) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)pipeline); } -int mmdeploy_pose_tracker_create_state(mmdeploy_pose_tracker_t pipeline, +int mmdeploy_pose_tracker_create_state(mmdeploy_pose_tracker_t pipeline, const mmdeploy_pose_tracker_param_t* params, - mmdeploy_pose_tracker_state_t* state) { - try { - auto create_fn = gRegistry().Create("pose_tracker::Create", Value()).value(); - *state = reinterpret_cast(new Value( - create_fn->Process({const_cast(params)}).value()[0])); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: 
{}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + mmdeploy_pose_tracker_state_t* state) +{ + try + { + auto create_fn = gRegistry().Create("pose_tracker::Create", Value()).value(); + *state = reinterpret_cast(new Value( + create_fn->Process({const_cast(params)}).value()[0])); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -void mmdeploy_pose_tracker_destroy_state(mmdeploy_pose_tracker_state_t state) { - delete reinterpret_cast(state); +void mmdeploy_pose_tracker_destroy_state(mmdeploy_pose_tracker_state_t state) +{ + delete reinterpret_cast(state); } int mmdeploy_pose_tracker_create_input(mmdeploy_pose_tracker_state_t* states, - const mmdeploy_mat_t* frames, const int32_t* use_detect, - int batch_size, mmdeploy_value_t* value) { - try { - Value::Array images; - Value::Array use_dets; - Value::Array trackers; - for (int i = 0; i < batch_size; ++i) { - images.push_back({{"ori_img", Cast(frames[i])}}); - use_dets.emplace_back(use_detect ? use_detect[i] : -1); - trackers.push_back(*reinterpret_cast(states[i])); + const mmdeploy_mat_t* frames, + const int32_t* use_detect, + int batch_size, + mmdeploy_value_t* value) +{ + try + { + Value::Array images; + Value::Array use_dets; + Value::Array trackers; + for (int i = 0; i < batch_size; ++i) + { + images.push_back({{"ori_img", Cast(frames[i])}}); + use_dets.emplace_back(use_detect ? use_detect[i] : -1); + trackers.push_back(*reinterpret_cast(states[i])); + } + *value = Take(Value{std::move(images), std::move(use_dets), std::move(trackers)}); + return MMDEPLOY_SUCCESS; } - *value = Take(Value{std::move(images), std::move(use_dets), std::move(trackers)}); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) 
+ { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -using ResultType = mmdeploy::Structure, - std::vector>; +using ResultType = mmdeploy::Structure, std::vector>; -int mmdeploy_pose_tracker_get_result(mmdeploy_value_t output, +int mmdeploy_pose_tracker_get_result(mmdeploy_value_t output, mmdeploy_pose_tracker_target_t** results, - int32_t** result_count) { - if (!output || !results) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - // convert result from Values - std::vector res; - from_value(Cast(output)->front(), res); - - size_t total = 0; - for (const auto& r : res) { - total += r.bboxes.size(); + int32_t** result_count) +{ + if (!output || !results) + { + return MMDEPLOY_E_INVALID_ARG; } + try + { + // convert result from Values + std::vector res; + from_value(Cast(output)->front(), res); - // preserve space for the output structure - ResultType result_type({total, 1, 1}); - auto [result_data, result_cnt, result_holder] = result_type.pointers(); + size_t total = 0; + for (const auto& r : res) + { + total += r.bboxes.size(); + } - auto result_ptr = result_data; + // preserve space for the output structure + ResultType result_type({total, 1, 1}); + auto [result_data, result_cnt, result_holder] = result_type.pointers(); - result_holder->swap(res); + auto result_ptr = result_data; - // build output structure - for (auto& r : *result_holder) { - for (int j = 0; j < r.bboxes.size(); ++j) { - auto& p = *result_ptr++; - p.keypoint_count = static_cast(r.keypoints[j].size()); - p.keypoints = r.keypoints[j].data(); - p.scores = r.scores[j].data(); - p.bbox = r.bboxes[j]; - p.target_id = r.track_ids[j]; - } - result_cnt->push_back(r.bboxes.size()); - // debug info - // p.reserved0 = new std::vector(r.pose_input_bboxes); - // p.reserved1 = new std::vector(r.pose_output_bboxes); - } + result_holder->swap(res); - *results = result_data; - *result_count = result_cnt->data(); - result_type.release(); + // build output structure + for (auto& r : *result_holder) + { + for (int j = 0; j < r.bboxes.size(); ++j) + { + auto& p = *result_ptr++; + p.keypoint_count = static_cast(r.keypoints[j].size()); + p.keypoints = r.keypoints[j].data(); + p.scores = r.scores[j].data(); + p.bbox = r.bboxes[j]; + p.target_id = r.track_ids[j]; + } + result_cnt->push_back(r.bboxes.size()); + // debug info + // p.reserved0 = new std::vector(r.pose_input_bboxes); + // p.reserved1 = new std::vector(r.pose_output_bboxes); + } - return MMDEPLOY_SUCCESS; + *results = result_data; + *result_count = result_cnt->data(); + result_type.release(); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) 
+ { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -int mmdeploy_pose_tracker_apply(mmdeploy_pose_tracker_t pipeline, - mmdeploy_pose_tracker_state_t* states, const mmdeploy_mat_t* frames, - const int32_t* use_detect, int32_t count, - mmdeploy_pose_tracker_target_t** results, int32_t** result_count) { - wrapped input; - if (auto ec = - mmdeploy_pose_tracker_create_input(states, frames, use_detect, count, input.ptr())) { - return ec; - } - wrapped output; - if (auto ec = mmdeploy_pipeline_apply((mmdeploy_pipeline_t)pipeline, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_pose_tracker_get_result(output, results, result_count)) { - return ec; - } - return MMDEPLOY_SUCCESS; +int mmdeploy_pose_tracker_apply(mmdeploy_pose_tracker_t pipeline, + mmdeploy_pose_tracker_state_t* states, + const mmdeploy_mat_t* frames, + const int32_t* use_detect, + int32_t count, + mmdeploy_pose_tracker_target_t** results, + int32_t** result_count) +{ + wrapped input; + if (auto ec = + mmdeploy_pose_tracker_create_input(states, frames, use_detect, count, input.ptr())) + { + return ec; + } + wrapped output; + if (auto ec = mmdeploy_pipeline_apply((mmdeploy_pipeline_t)pipeline, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_pose_tracker_get_result(output, results, result_count)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } void mmdeploy_pose_tracker_release_result(mmdeploy_pose_tracker_target_t* results, - const int32_t* result_count, int count) { - auto total = std::accumulate(result_count, result_count + count, 0); - ResultType deleter({static_cast(total), 1, 1}, results); + const int32_t* result_count, + int count) +{ + auto total = std::accumulate(result_count, result_count + count, 0); + ResultType deleter({static_cast(total), 1, 1}, results); } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h b/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h index 4b27fbab8a..c8191b40fa 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h @@ -14,142 +14,147 @@ #include "mmdeploy/pose_detector.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_pose_tracker* mmdeploy_pose_tracker_t; -typedef struct mmdeploy_pose_tracker_state* mmdeploy_pose_tracker_state_t; - -typedef struct mmdeploy_pose_tracker_param_t { - // detection interval, default = 1 - int32_t det_interval; - // detection label use for pose estimation, default = 0 - int32_t det_label; - // detection score threshold, default = 0.5 - float det_thr; - // detection minimum bbox size (compute as sqrt(area)), default = -1 - float det_min_bbox_size; - // nms iou threshold for merging detected bboxes and bboxes from tracked targets, default = 0.7 - float det_nms_thr; - - // max number of bboxes used for pose estimation per frame, default = -1 - int32_t pose_max_num_bboxes; - // threshold for visible key-points, default = 0.5 - float pose_kpt_thr; - // min number of key-points for valid poses (-1 indicates ceil(n_kpts/2)), default = -1 - int32_t pose_min_keypoints; - // scale for expanding key-points to bbox, default = 1.25 - float pose_bbox_scale; - // min pose bbox size, tracks with bbox size smaller than the threshold will be dropped, - // default = -1 - float pose_min_bbox_size; - // nms oks/iou threshold for suppressing overlapped poses, useful when multiple pose estimations - // collapse to the same target, default = 0.5 - float pose_nms_thr; - // keypoint sigmas for computing OKS, will use IOU if not set, 
default = nullptr - float* keypoint_sigmas; - // size of keypoint sigma array, must be consistent with the number of key-points, default = 0 - int32_t keypoint_sigmas_size; - - // iou threshold for associating missing tracks, default = 0.4 - float track_iou_thr; - // max number of missing frames before a missing tracks is removed, default = 10 - int32_t track_max_missing; - // track history size, default = 1 - int32_t track_history_size; - - // weight of position for setting covariance matrices of kalman filters, default = 0.05 - float std_weight_position; - // weight of velocity for setting covariance matrices of kalman filters, default = 0.00625 - float std_weight_velocity; - - // params for the one-euro filter for smoothing the outputs - (beta, fc_min, fc_derivative) - // default = (0.007, 1, 1) - float smooth_params[3]; -} mmdeploy_pose_tracker_param_t; - -typedef struct mmdeploy_pose_tracker_target_t { - mmdeploy_point_t* keypoints; // key-points of the target - int32_t keypoint_count; // size of `keypoints` array - float* scores; // scores of each key-point - mmdeploy_rect_t bbox; // estimated bbox from key-points - uint32_t target_id; // target id from internal tracker -} mmdeploy_pose_tracker_target_t; - -/** - * @brief Fill params with default parameters - * @param[in,out] params - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_pose_tracker_default_params(mmdeploy_pose_tracker_param_t* params); - -/** - * @brief Create pose tracker pipeline - * @param[in] det_model detection model object, created by \ref mmdeploy_model_create - * @param[in] pose_model pose model object - * @param[in] context context object describing execution environment (device, profiler, etc...), - * created by \ref mmdeploy_context_create - * @param[out] pipeline handle of the created pipeline - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_pose_tracker_create(mmdeploy_model_t det_model, - mmdeploy_model_t pose_model, - mmdeploy_context_t context, - mmdeploy_pose_tracker_t* pipeline); - -/** - * @brief Destroy pose tracker pipeline - * @param[in] pipeline - */ -MMDEPLOY_API void mmdeploy_pose_tracker_destroy(mmdeploy_pose_tracker_t pipeline); - -/** - * @brief Create a tracker state handle corresponds to a video stream - * @param[in] pipeline handle of a pose tracker pipeline - * @param[in] params params for creating the tracker state - * @param[out] state handle of the created tracker state - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_pose_tracker_create_state(mmdeploy_pose_tracker_t pipeline, - const mmdeploy_pose_tracker_param_t* params, - mmdeploy_pose_tracker_state_t* state); - -/** - * @brief Destroy tracker state - * @param[in] state handle of the tracker state - */ -MMDEPLOY_API void mmdeploy_pose_tracker_destroy_state(mmdeploy_pose_tracker_state_t state); - -/** - * @brief Apply pose tracker pipeline, notice that this function supports batch operation by feeding - * arrays of size \p count to \p states, \p frames and \p use_detect - * @param[in] pipeline handle of a pose tracker pipeline - * @param[in] states tracker states handles, array of size \p count - * @param[in] frames input frames of size \p count - * @param[in] use_detect control the use of detector, array of size \p count - * -1: use params.det_interval, 0: don't use detector, 1: force use detector - * @param[in] count batch size - * @param[out] results a linear buffer contains the tracked targets of input frames. 
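The parameter block above reads as a two-step contract: populate every field via the defaults call first, then override individual members. A sketch under that assumption; `tracker` is a pipeline assumed to be created by mmdeploy_pose_tracker_create, and the sigma table is a placeholder for a real 17-keypoint set:

mmdeploy_pose_tracker_param_t params;
mmdeploy_pose_tracker_default_params(&params); /* fill every field before overriding */
params.det_interval = 5;     /* run the detector on every 5th frame only */
params.pose_kpt_thr = 0.6f;  /* stricter keypoint visibility threshold */

static float sigmas[17] = {0.026f /* , ... one sigma per keypoint ... */};
params.keypoint_sigmas      = sigmas; /* switches pose NMS from IOU to OKS */
params.keypoint_sigmas_size = 17;     /* must match the number of key-points */

mmdeploy_pose_tracker_state_t state = NULL;
int ec = mmdeploy_pose_tracker_create_state(tracker, &params, &state);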
Should be - * released by \ref mmdeploy_pose_tracker_release_result - * @param[out] result_count a linear buffer of size \p count contains the number of tracked - * targets of the frames. Should be released by \ref mmdeploy_pose_tracker_release_result - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_pose_tracker_apply(mmdeploy_pose_tracker_t pipeline, - mmdeploy_pose_tracker_state_t* states, - const mmdeploy_mat_t* frames, - const int32_t* use_detect, int32_t count, - mmdeploy_pose_tracker_target_t** results, - int32_t** result_count); - -/** - * @brief Release result objects - * @param[in] results - * @param[in] result_count - * @param[in] count - */ -MMDEPLOY_API void mmdeploy_pose_tracker_release_result(mmdeploy_pose_tracker_target_t* results, - const int32_t* result_count, int count); + typedef struct mmdeploy_pose_tracker* mmdeploy_pose_tracker_t; + typedef struct mmdeploy_pose_tracker_state* mmdeploy_pose_tracker_state_t; + + typedef struct mmdeploy_pose_tracker_param_t + { + // detection interval, default = 1 + int32_t det_interval; + // detection label use for pose estimation, default = 0 + int32_t det_label; + // detection score threshold, default = 0.5 + float det_thr; + // detection minimum bbox size (compute as sqrt(area)), default = -1 + float det_min_bbox_size; + // nms iou threshold for merging detected bboxes and bboxes from tracked targets, default = 0.7 + float det_nms_thr; + + // max number of bboxes used for pose estimation per frame, default = -1 + int32_t pose_max_num_bboxes; + // threshold for visible key-points, default = 0.5 + float pose_kpt_thr; + // min number of key-points for valid poses (-1 indicates ceil(n_kpts/2)), default = -1 + int32_t pose_min_keypoints; + // scale for expanding key-points to bbox, default = 1.25 + float pose_bbox_scale; + // min pose bbox size, tracks with bbox size smaller than the threshold will be dropped, + // default = -1 + float pose_min_bbox_size; + // nms oks/iou threshold for suppressing overlapped poses, useful when multiple pose estimations + // collapse to the same target, default = 0.5 + float pose_nms_thr; + // keypoint sigmas for computing OKS, will use IOU if not set, default = nullptr + float* keypoint_sigmas; + // size of keypoint sigma array, must be consistent with the number of key-points, default = 0 + int32_t keypoint_sigmas_size; + + // iou threshold for associating missing tracks, default = 0.4 + float track_iou_thr; + // max number of missing frames before a missing tracks is removed, default = 10 + int32_t track_max_missing; + // track history size, default = 1 + int32_t track_history_size; + + // weight of position for setting covariance matrices of kalman filters, default = 0.05 + float std_weight_position; + // weight of velocity for setting covariance matrices of kalman filters, default = 0.00625 + float std_weight_velocity; + + // params for the one-euro filter for smoothing the outputs - (beta, fc_min, fc_derivative) + // default = (0.007, 1, 1) + float smooth_params[3]; + } mmdeploy_pose_tracker_param_t; + + typedef struct mmdeploy_pose_tracker_target_t + { + mmdeploy_point_t* keypoints; // key-points of the target + int32_t keypoint_count; // size of `keypoints` array + float* scores; // scores of each key-point + mmdeploy_rect_t bbox; // estimated bbox from key-points + uint32_t target_id; // target id from internal tracker + } mmdeploy_pose_tracker_target_t; + + /** + * @brief Fill params with default parameters + * @param[in,out] params + * @return status of the operation + */ + 
MMDEPLOY_API int mmdeploy_pose_tracker_default_params(mmdeploy_pose_tracker_param_t* params); + + /** + * @brief Create pose tracker pipeline + * @param[in] det_model detection model object, created by \ref mmdeploy_model_create + * @param[in] pose_model pose model object + * @param[in] context context object describing execution environment (device, profiler, etc...), + * created by \ref mmdeploy_context_create + * @param[out] pipeline handle of the created pipeline + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_pose_tracker_create(mmdeploy_model_t det_model, + mmdeploy_model_t pose_model, + mmdeploy_context_t context, + mmdeploy_pose_tracker_t* pipeline); + + /** + * @brief Destroy pose tracker pipeline + * @param[in] pipeline + */ + MMDEPLOY_API void mmdeploy_pose_tracker_destroy(mmdeploy_pose_tracker_t pipeline); + + /** + * @brief Create a tracker state handle corresponds to a video stream + * @param[in] pipeline handle of a pose tracker pipeline + * @param[in] params params for creating the tracker state + * @param[out] state handle of the created tracker state + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_pose_tracker_create_state(mmdeploy_pose_tracker_t pipeline, + const mmdeploy_pose_tracker_param_t* params, + mmdeploy_pose_tracker_state_t* state); + + /** + * @brief Destroy tracker state + * @param[in] state handle of the tracker state + */ + MMDEPLOY_API void mmdeploy_pose_tracker_destroy_state(mmdeploy_pose_tracker_state_t state); + + /** + * @brief Apply pose tracker pipeline, notice that this function supports batch operation by feeding + * arrays of size \p count to \p states, \p frames and \p use_detect + * @param[in] pipeline handle of a pose tracker pipeline + * @param[in] states tracker states handles, array of size \p count + * @param[in] frames input frames of size \p count + * @param[in] use_detect control the use of detector, array of size \p count + * -1: use params.det_interval, 0: don't use detector, 1: force use detector + * @param[in] count batch size + * @param[out] results a linear buffer contains the tracked targets of input frames. Should be + * released by \ref mmdeploy_pose_tracker_release_result + * @param[out] result_count a linear buffer of size \p count contains the number of tracked + * targets of the frames. 
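Since each call produces a `results` buffer plus a per-frame count, the natural shape is one release per iteration. A sketch of the single-stream case, continuing from the state created above; `grab_frame` and `draw_skeleton` are placeholders for the caller's capture and rendering code:

mmdeploy_mat_t frame;
while (grab_frame(&frame)) /* placeholder capture loop */
{
    mmdeploy_pose_tracker_target_t* targets      = NULL;
    int32_t*                        target_count = NULL;
    /* use_detect == NULL falls back to params.det_interval, per the docs above */
    if (mmdeploy_pose_tracker_apply(tracker, &state, &frame, NULL, 1, &targets, &target_count) != MMDEPLOY_SUCCESS)
    {
        break;
    }
    for (int32_t i = 0; i < target_count[0]; ++i)
    {
        draw_skeleton(&frame, &targets[i]); /* placeholder consumer */
    }
    mmdeploy_pose_tracker_release_result(targets, target_count, 1);
}
mmdeploy_pose_tracker_destroy_state(state);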
Should be released by \ref mmdeploy_pose_tracker_release_result
+     * @return status of the operation
+     */
+    MMDEPLOY_API int mmdeploy_pose_tracker_apply(mmdeploy_pose_tracker_t pipeline,
+                                                 mmdeploy_pose_tracker_state_t* states,
+                                                 const mmdeploy_mat_t* frames,
+                                                 const int32_t* use_detect,
+                                                 int32_t count,
+                                                 mmdeploy_pose_tracker_target_t** results,
+                                                 int32_t** result_count);
+
+    /**
+     * @brief Release result objects
+     * @param[in] results
+     * @param[in] result_count
+     * @param[in] count
+     */
+    MMDEPLOY_API void mmdeploy_pose_tracker_release_result(mmdeploy_pose_tracker_target_t* results,
+                                                           const int32_t* result_count,
+                                                           int count);

 #ifdef __cplusplus
 }
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/restorer.cpp b/csrc/mmdeploy/apis/c/mmdeploy/restorer.cpp
index 9ca2ca65f7..49f8487d12 100644
--- a/csrc/mmdeploy/apis/c/mmdeploy/restorer.cpp
+++ b/csrc/mmdeploy/apis/c/mmdeploy/restorer.cpp
@@ -16,106 +16,121 @@
 using namespace mmdeploy;

 using ResultType = mmdeploy::Structure;

-int mmdeploy_restorer_create(mmdeploy_model_t model, const char* device_name, int device_id,
-                             mmdeploy_restorer_t* restorer) {
-  mmdeploy_context_t context{};
-  auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
-  if (ec != MMDEPLOY_SUCCESS) {
+int mmdeploy_restorer_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_restorer_t* restorer)
+{
+    mmdeploy_context_t context{};
+    auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
+    if (ec != MMDEPLOY_SUCCESS)
+    {
+        return ec;
+    }
+    ec = mmdeploy_restorer_create_v2(model, context, restorer);
+    mmdeploy_context_destroy(context);
     return ec;
-  }
-  ec = mmdeploy_restorer_create_v2(model, context, restorer);
-  mmdeploy_context_destroy(context);
-  return ec;
 }

-int mmdeploy_restorer_create_by_path(const char* model_path, const char* device_name, int device_id,
-                                     mmdeploy_restorer_t* restorer) {
-  mmdeploy_model_t model{};
-  if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) {
+int mmdeploy_restorer_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_restorer_t* restorer)
+{
+    mmdeploy_model_t model{};
+    if (auto ec = mmdeploy_model_create_by_path(model_path, &model))
+    {
+        return ec;
+    }
+    auto ec = mmdeploy_restorer_create(model, device_name, device_id, restorer);
+    mmdeploy_model_destroy(model);
     return ec;
-  }
-  auto ec = mmdeploy_restorer_create(model, device_name, device_id, restorer);
-  mmdeploy_model_destroy(model);
-  return ec;
 }

-int mmdeploy_restorer_apply(mmdeploy_restorer_t restorer, const mmdeploy_mat_t* images, int count,
-                            mmdeploy_mat_t** results) {
-  wrapped<mmdeploy_value_t> input;
-  if (auto ec = mmdeploy_restorer_create_input(images, count, input.ptr())) {
-    return ec;
-  }
-  wrapped<mmdeploy_value_t> output;
-  if (auto ec = mmdeploy_restorer_apply_v2(restorer, input, output.ptr())) {
-    return ec;
-  }
-  if (auto ec = mmdeploy_restorer_get_result(output, results)) {
-    return ec;
-  }
-  return MMDEPLOY_SUCCESS;
+int mmdeploy_restorer_apply(mmdeploy_restorer_t restorer, const mmdeploy_mat_t* images, int count, mmdeploy_mat_t** results)
+{
+    wrapped<mmdeploy_value_t> input;
+    if (auto ec = mmdeploy_restorer_create_input(images, count, input.ptr()))
+    {
+        return ec;
+    }
+    wrapped<mmdeploy_value_t> output;
+    if (auto ec = mmdeploy_restorer_apply_v2(restorer, input, output.ptr()))
+    {
+        return ec;
+    }
+    if (auto ec = mmdeploy_restorer_get_result(output, results))
+    {
+        return ec;
+    }
+    return MMDEPLOY_SUCCESS;
 }

-void mmdeploy_restorer_release_result(mmdeploy_mat_t* results, int count) {
-  ResultType deleter{static_cast<size_t>(count), results};
+void mmdeploy_restorer_release_result(mmdeploy_mat_t* results, int count)
+{
+    ResultType deleter{static_cast<size_t>(count), results};
 }

-void mmdeploy_restorer_destroy(mmdeploy_restorer_t restorer) {
-  mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)restorer);
+void mmdeploy_restorer_destroy(mmdeploy_restorer_t restorer)
+{
+    mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)restorer);
 }

-int mmdeploy_restorer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context,
-                                mmdeploy_restorer_t* restorer) {
-  return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)restorer);
+int mmdeploy_restorer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_restorer_t* restorer)
+{
+    return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)restorer);
 }

-int mmdeploy_restorer_create_input(const mmdeploy_mat_t* mats, int mat_count,
-                                   mmdeploy_value_t* value) {
-  return mmdeploy_common_create_input(mats, mat_count, value);
+int mmdeploy_restorer_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value)
+{
+    return mmdeploy_common_create_input(mats, mat_count, value);
 }

-int mmdeploy_restorer_apply_v2(mmdeploy_restorer_t restorer, mmdeploy_value_t input,
-                               mmdeploy_value_t* output) {
-  return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)restorer, input, output);
+int mmdeploy_restorer_apply_v2(mmdeploy_restorer_t restorer, mmdeploy_value_t input, mmdeploy_value_t* output)
+{
+    return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)restorer, input, output);
 }

-int mmdeploy_restorer_apply_async(mmdeploy_restorer_t restorer, mmdeploy_sender_t input,
-                                  mmdeploy_sender_t* output) {
-  return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)restorer, input, output);
+int mmdeploy_restorer_apply_async(mmdeploy_restorer_t restorer, mmdeploy_sender_t input, mmdeploy_sender_t* output)
+{
+    return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)restorer, input, output);
 }

-int mmdeploy_restorer_get_result(mmdeploy_value_t output, mmdeploy_mat_t** results) {
-  if (!output || !results) {
-    return MMDEPLOY_E_INVALID_ARG;
-  }
-  try {
-    const Value& value = Cast(output)->front();
-
-    auto restorer_output = from_value<std::vector<Mat>>(value);
-    auto count = restorer_output.size();
-
-    ResultType r(count);
-    auto [_results, buffers] = r.pointers();
-
-    for (int i = 0; i < count; ++i) {
-      auto upscale = restorer_output[i];
-      auto& res = _results[i];
-      res.data = upscale.data();
-      buffers[i] = upscale.buffer();
-      res.format = (mmdeploy_pixel_format_t)upscale.pixel_format();
-      res.height = upscale.height();
-      res.width = upscale.width();
-      res.channel = upscale.channel();
-      res.type = (mmdeploy_data_type_t)upscale.type();
+int mmdeploy_restorer_get_result(mmdeploy_value_t output, mmdeploy_mat_t** results)
+{
+    if (!output || !results)
+    {
+        return MMDEPLOY_E_INVALID_ARG;
     }
-
-    *results = _results;
-    r.release();
-
-    return MMDEPLOY_SUCCESS;
-  } catch (const std::exception& e) {
-    MMDEPLOY_ERROR("unhandled exception: {}", e.what());
-  } catch (...) {
-    MMDEPLOY_ERROR("unknown exception caught");
-  }
-  return MMDEPLOY_E_FAIL;
+    try
+    {
+        const Value& value = Cast(output)->front();
+
+        auto restorer_output = from_value<std::vector<Mat>>(value);
+        auto count = restorer_output.size();
+
+        ResultType r(count);
+        auto [_results, buffers] = r.pointers();
+
+        for (int i = 0; i < count; ++i)
+        {
+            auto upscale = restorer_output[i];
+            auto& res = _results[i];
+            res.data = upscale.data();
+            buffers[i] = upscale.buffer();
+            res.format = (mmdeploy_pixel_format_t)upscale.pixel_format();
+            res.height = upscale.height();
+            res.width = upscale.width();
+            res.channel = upscale.channel();
+            res.type = (mmdeploy_data_type_t)upscale.type();
+        }
+
+        *results = _results;
+        r.release();
+
+        return MMDEPLOY_SUCCESS;
+    }
+    catch (const std::exception& e)
+    {
+        MMDEPLOY_ERROR("unhandled exception: {}", e.what());
+    }
+    catch (...)
+    {
+        MMDEPLOY_ERROR("unknown exception caught");
+    }
+    return MMDEPLOY_E_FAIL;
 }
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/restorer.h b/csrc/mmdeploy/apis/c/mmdeploy/restorer.h
index 9ab529850f..5c8533102f 100644
--- a/csrc/mmdeploy/apis/c/mmdeploy/restorer.h
+++ b/csrc/mmdeploy/apis/c/mmdeploy/restorer.h
@@ -13,76 +13,72 @@
 #include "mmdeploy/model.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

-typedef struct mmdeploy_restorer* mmdeploy_restorer_t;
-
-/**
- * @brief Create a restorer instance
- * @param[in] model an instance of image restoration model created by
- * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h
- * @param[in] device_name name of device, such as "cpu", "cuda", etc.
- * @param[in] device_id id of device.
- * @param[out] restorer handle of the created restorer, which must be destroyed
- * by \ref mmdeploy_restorer_destroy
- * @return status code of the operation
- */
-MMDEPLOY_API int mmdeploy_restorer_create(mmdeploy_model_t model, const char* device_name,
-                                          int device_id, mmdeploy_restorer_t* restorer);
-
-/**
- * @brief Create a restorer instance
- * @param[in] model_path path to image restoration model
- * @param[in] device_name name of device, such as "cpu", "cuda", etc.
- * @param[in] device_id id of device.
- * @param[out] restorer handle of the created restorer, which must be destroyed
- * by \ref mmdeploy_restorer_destroy
- * @return status code of the operation
- */
-MMDEPLOY_API int mmdeploy_restorer_create_by_path(const char* model_path, const char* device_name,
-                                                  int device_id, mmdeploy_restorer_t* restorer);
-
-/**
- * @brief Apply restorer to a batch of images
- * @param[in] restorer restorer's handle created by \ref mmdeploy_restorer_create_by_path
- * @param[in] images a batch of images
- * @param[in] count number of images in the batch
- * @param[out] results a linear buffer contains the restored images, must be release
- * by \ref mmdeploy_restorer_release_result
- * @return status code of the operation
- */
-MMDEPLOY_API int mmdeploy_restorer_apply(mmdeploy_restorer_t restorer, const mmdeploy_mat_t* images,
-                                         int count, mmdeploy_mat_t** results);
-
-/** @brief Release result buffer returned by \ref mmdeploy_restorer_apply
- * @param[in] results result buffer by restorer
- * @param[in] count length of \p result
- */
-MMDEPLOY_API void mmdeploy_restorer_release_result(mmdeploy_mat_t* results, int count);
-
-/**
- * @brief destroy restorer
- * @param[in] restorer handle of restorer created by \ref mmdeploy_restorer_create_by_path
- */
-MMDEPLOY_API void mmdeploy_restorer_destroy(mmdeploy_restorer_t restorer);
-
-/******************************************************************************
- * Experimental asynchronous APIs */
-
-MMDEPLOY_API int mmdeploy_restorer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context,
-                                             mmdeploy_restorer_t* restorer);
-
-MMDEPLOY_API int mmdeploy_restorer_create_input(const mmdeploy_mat_t* mats, int mat_count,
-                                                mmdeploy_value_t* value);
-
-MMDEPLOY_API int mmdeploy_restorer_apply_v2(mmdeploy_restorer_t restorer, mmdeploy_value_t input,
-                                            mmdeploy_value_t* output);
-
-MMDEPLOY_API int mmdeploy_restorer_apply_async(mmdeploy_restorer_t restorer,
-                                               mmdeploy_sender_t input, mmdeploy_sender_t* output);
-
-MMDEPLOY_API int mmdeploy_restorer_get_result(mmdeploy_value_t output, mmdeploy_mat_t** results);
+    typedef struct mmdeploy_restorer* mmdeploy_restorer_t;
+
+    /**
+     * @brief Create a restorer instance
+     * @param[in] model an instance of image restoration model created by
+     * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h
+     * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+     * @param[in] device_id id of device.
+     * @param[out] restorer handle of the created restorer, which must be destroyed
+     * by \ref mmdeploy_restorer_destroy
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_restorer_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_restorer_t* restorer);
+
+    /**
+     * @brief Create a restorer instance
+     * @param[in] model_path path to image restoration model
+     * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+     * @param[in] device_id id of device.
+     * @param[out] restorer handle of the created restorer, which must be destroyed
+     * by \ref mmdeploy_restorer_destroy
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_restorer_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_restorer_t* restorer);
+
+    /**
+     * @brief Apply restorer to a batch of images
+     * @param[in] restorer restorer's handle created by \ref mmdeploy_restorer_create_by_path
+     * @param[in] images a batch of images
+     * @param[in] count number of images in the batch
+     * @param[out] results a linear buffer that contains the restored images. It must be released
+     * by \ref mmdeploy_restorer_release_result
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_restorer_apply(mmdeploy_restorer_t restorer, const mmdeploy_mat_t* images, int count, mmdeploy_mat_t** results);
+
+    /** @brief Release result buffer returned by \ref mmdeploy_restorer_apply
+     * @param[in] results result buffer returned by the restorer
+     * @param[in] count length of \p results
+     */
+    MMDEPLOY_API void mmdeploy_restorer_release_result(mmdeploy_mat_t* results, int count);
+
+    /**
+     * @brief Destroy restorer
+     * @param[in] restorer handle of restorer created by \ref mmdeploy_restorer_create_by_path
+     */
+    MMDEPLOY_API void mmdeploy_restorer_destroy(mmdeploy_restorer_t restorer);
+
+    /******************************************************************************
+     * Experimental asynchronous APIs */
+
+    MMDEPLOY_API int mmdeploy_restorer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_restorer_t* restorer);
+
+    MMDEPLOY_API int mmdeploy_restorer_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value);
+
+    MMDEPLOY_API int mmdeploy_restorer_apply_v2(mmdeploy_restorer_t restorer, mmdeploy_value_t input, mmdeploy_value_t* output);
+
+    MMDEPLOY_API int mmdeploy_restorer_apply_async(mmdeploy_restorer_t restorer,
+                                                   mmdeploy_sender_t input,
+                                                   mmdeploy_sender_t* output);
+
+    MMDEPLOY_API int mmdeploy_restorer_get_result(mmdeploy_value_t output, mmdeploy_mat_t** results);

 #ifdef __cplusplus
 }
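(The experimental *_v2/*_async entry points above compose with the executor API. A rough sketch of a blocking variant, assuming mmdeploy_executor_just and mmdeploy_executor_sync_wait from executor.h — used the same way by text_detector.cpp later in this patch — and mmdeploy_value_destroy from common.h; the ownership details here are assumptions, not a contract.)

    int run_restorer_async(mmdeploy_restorer_t restorer, const mmdeploy_mat_t* image)
    {
        mmdeploy_value_t input{};
        if (int ec = mmdeploy_restorer_create_input(image, 1, &input))
        {
            return ec;
        }
        // wrap the packed input in a sender and run the pipeline asynchronously
        mmdeploy_sender_t output_sender{};
        if (int ec = mmdeploy_restorer_apply_async(restorer, mmdeploy_executor_just(input), &output_sender))
        {
            return ec;
        }
        mmdeploy_value_t output = mmdeploy_executor_sync_wait(output_sender);  // assumed blocking helper
        mmdeploy_mat_t* results{};
        int ec = mmdeploy_restorer_get_result(output, &results);
        if (ec == MMDEPLOY_SUCCESS)
        {
            mmdeploy_restorer_release_result(results, 1);
        }
        mmdeploy_value_destroy(output);  // assumed cleanup helper
        return ec;
    }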
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.cpp b/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.cpp
index d2172c54b8..04d537a376 100644
--- a/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.cpp
+++ b/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.cpp
@@ -15,124 +15,146 @@
 using namespace std;
 using namespace mmdeploy;

-int mmdeploy_rotated_detector_create(mmdeploy_model_t model, const char* device_name, int device_id,
-                                     mmdeploy_rotated_detector_t* detector) {
-  mmdeploy_context_t context{};
-  auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
-  if (ec != MMDEPLOY_SUCCESS) {
+int mmdeploy_rotated_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_rotated_detector_t* detector)
+{
+    mmdeploy_context_t context{};
+    auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
+    if (ec != MMDEPLOY_SUCCESS)
+    {
+        return ec;
+    }
+    ec = mmdeploy_rotated_detector_create_v2(model, context, detector);
+    mmdeploy_context_destroy(context);
     return ec;
-  }
-  ec = mmdeploy_rotated_detector_create_v2(model, context, detector);
-  mmdeploy_context_destroy(context);
-  return ec;
 }

-int mmdeploy_rotated_detector_create_by_path(const char* model_path, const char* device_name,
-                                             int device_id, mmdeploy_rotated_detector_t* detector) {
-  mmdeploy_model_t model{};
+int mmdeploy_rotated_detector_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_rotated_detector_t* detector)
+{
+    mmdeploy_model_t model{};

-  if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) {
+    if (auto ec = mmdeploy_model_create_by_path(model_path, &model))
+    {
+        return ec;
+    }
+    auto ec = mmdeploy_rotated_detector_create(model, device_name, device_id, detector);
+    mmdeploy_model_destroy(model);
     return ec;
-  }
-  auto ec = mmdeploy_rotated_detector_create(model, device_name, device_id, detector);
-  mmdeploy_model_destroy(model);
-  return ec;
 }

-int mmdeploy_rotated_detector_apply(mmdeploy_rotated_detector_t detector,
-                                    const mmdeploy_mat_t* mats, int mat_count,
-                                    mmdeploy_rotated_detection_t** results, int** result_count) {
-  wrapped<mmdeploy_value_t> input;
-  if (auto ec = mmdeploy_rotated_detector_create_input(mats, mat_count, input.ptr())) {
-    return ec;
-  }
-  wrapped<mmdeploy_value_t> output;
-  if (auto ec = mmdeploy_rotated_detector_apply_v2(detector, input, output.ptr())) {
-    return ec;
-  }
-  if (auto ec = mmdeploy_rotated_detector_get_result(output, results, result_count)) {
-    return ec;
-  }
-  return MMDEPLOY_SUCCESS;
+int mmdeploy_rotated_detector_apply(mmdeploy_rotated_detector_t detector,
+                                    const mmdeploy_mat_t* mats,
+                                    int mat_count,
+                                    mmdeploy_rotated_detection_t** results,
+                                    int** result_count)
+{
+    wrapped<mmdeploy_value_t> input;
+    if (auto ec = mmdeploy_rotated_detector_create_input(mats, mat_count, input.ptr()))
+    {
+        return ec;
+    }
+    wrapped<mmdeploy_value_t> output;
+    if (auto ec = mmdeploy_rotated_detector_apply_v2(detector, input, output.ptr()))
+    {
+        return ec;
+    }
+    if (auto ec = mmdeploy_rotated_detector_get_result(output, results, result_count))
+    {
+        return ec;
+    }
+    return MMDEPLOY_SUCCESS;
 }

 void mmdeploy_rotated_detector_release_result(mmdeploy_rotated_detection_t* results,
-                                              const int* result_count) {
-  delete[] results;
-  delete[] result_count;
+                                              const int* result_count)
+{
+    delete[] results;
+    delete[] result_count;
 }

-void mmdeploy_rotated_detector_destroy(mmdeploy_rotated_detector_t detector) {
-  mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector);
+void mmdeploy_rotated_detector_destroy(mmdeploy_rotated_detector_t detector)
+{
+    mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector);
 }

-int mmdeploy_rotated_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context,
-                                        mmdeploy_rotated_detector_t* detector) {
-  return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector);
+int mmdeploy_rotated_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_rotated_detector_t* detector)
+{
+    return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector);
 }

-int mmdeploy_rotated_detector_create_input(const mmdeploy_mat_t* mats, int mat_count,
-                                           mmdeploy_value_t* input) {
-  return mmdeploy_common_create_input(mats, mat_count, input);
+int mmdeploy_rotated_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* input)
+{
+    return mmdeploy_common_create_input(mats, mat_count, input);
 }

-int mmdeploy_rotated_detector_apply_v2(mmdeploy_rotated_detector_t detector, mmdeploy_value_t input,
-                                       mmdeploy_value_t* output) {
-  return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output);
+int mmdeploy_rotated_detector_apply_v2(mmdeploy_rotated_detector_t detector, mmdeploy_value_t input, mmdeploy_value_t* output)
+{
+    return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output);
 }

 int mmdeploy_rotated_detector_apply_async(mmdeploy_rotated_detector_t detector,
-                                          mmdeploy_sender_t input, mmdeploy_sender_t* output) {
-  return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output);
+                                          mmdeploy_sender_t input,
+                                          mmdeploy_sender_t* output)
+{
+    return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output);
 }

-int mmdeploy_rotated_detector_get_result(mmdeploy_value_t output,
+int mmdeploy_rotated_detector_get_result(mmdeploy_value_t output,
                                          mmdeploy_rotated_detection_t** results,
-                                         int** result_count) {
-  if (!output || !results || !result_count) {
-    return MMDEPLOY_E_INVALID_ARG;
-  }
-
-  try {
-    Value& value = Cast(output)->front();
-    auto detector_outputs = from_value<vector<mmrotate::RotatedDetectorOutput>>(value);
-
-    vector<int> _result_count;
-    _result_count.reserve(detector_outputs.size());
-    for (const auto& det_output : detector_outputs) {
-      _result_count.push_back((int)det_output.detections.size());
+                                         int** result_count)
+{
+    if (!output || !results || !result_count)
+    {
+        return MMDEPLOY_E_INVALID_ARG;
     }

-    auto total = std::accumulate(_result_count.begin(), _result_count.end(), 0);
+    try
+    {
+        Value& value = Cast(output)->front();
+        auto detector_outputs = from_value<vector<mmrotate::RotatedDetectorOutput>>(value);

-    std::unique_ptr<int[]> result_count_data(new int[_result_count.size()]{});
-    std::copy(_result_count.begin(), _result_count.end(), result_count_data.get());
-
-    std::unique_ptr<mmdeploy_rotated_detection_t[]> result_data(
-        new mmdeploy_rotated_detection_t[total]{});
-    auto result_ptr = result_data.get();
-
-    for (const auto& det_output : detector_outputs) {
-      for (const auto& detection : det_output.detections) {
-        result_ptr->label_id = detection.label_id;
-        result_ptr->score = detection.score;
-        const auto& rbbox = detection.rbbox;
-        for (int i = 0; i < 5; i++) {
-          result_ptr->rbbox[i] = rbbox[i];
+        vector<int> _result_count;
+        _result_count.reserve(detector_outputs.size());
+        for (const auto& det_output : detector_outputs)
+        {
+            _result_count.push_back((int)det_output.detections.size());
         }
-        ++result_ptr;
-      }
-    }

-    *result_count = result_count_data.release();
-    *results = result_data.release();
+        auto total = std::accumulate(_result_count.begin(), _result_count.end(), 0);
+
+        std::unique_ptr<int[]> result_count_data(new int[_result_count.size()]{});
+        std::copy(_result_count.begin(), _result_count.end(), result_count_data.get());
+
+        std::unique_ptr<mmdeploy_rotated_detection_t[]> result_data(
+            new mmdeploy_rotated_detection_t[total]{});
+        auto result_ptr = result_data.get();
+
+        for (const auto& det_output : detector_outputs)
+        {
+            for (const auto& detection : det_output.detections)
+            {
+                result_ptr->label_id = detection.label_id;
+                result_ptr->score = detection.score;
+                const auto& rbbox = detection.rbbox;
+                for (int i = 0; i < 5; i++)
+                {
+                    result_ptr->rbbox[i] = rbbox[i];
+                }
+                ++result_ptr;
+            }
+        }

-    return MMDEPLOY_SUCCESS;
+        *result_count = result_count_data.release();
+        *results = result_data.release();

-  } catch (const std::exception& e) {
-    MMDEPLOY_ERROR("unhandled exception: {}", e.what());
-  } catch (...) {
-    MMDEPLOY_ERROR("unknown exception caught");
-  }
-  return MMDEPLOY_E_FAIL;
+        return MMDEPLOY_SUCCESS;
+    }
+    catch (const std::exception& e)
+    {
+        MMDEPLOY_ERROR("unhandled exception: {}", e.what());
+    }
+    catch (...)
+    {
+        MMDEPLOY_ERROR("unknown exception caught");
+    }
+    return MMDEPLOY_E_FAIL;
 }
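(The flat result buffer plus per-image count array produced by mmdeploy_rotated_detector_get_result above is walked with two nested loops, e.g. — illustrative only:)

    // dets is a single linear buffer; det_count[i] detections belong to image i
    mmdeploy_rotated_detection_t* dets{};
    int* det_count{};
    if (mmdeploy_rotated_detector_apply(detector, mats, mat_count, &dets, &det_count) == MMDEPLOY_SUCCESS)
    {
        const mmdeploy_rotated_detection_t* p = dets;
        for (int i = 0; i < mat_count; ++i)
        {
            for (int j = 0; j < det_count[i]; ++j, ++p)
            {
                // rbbox layout: cx, cy, w, h, angle (see rotated_detector.h below)
                printf("image %d: label=%d score=%.3f angle=%.3f\n", i, p->label_id, p->score, p->rbbox[4]);
            }
        }
        mmdeploy_rotated_detector_release_result(dets, det_count);
    }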
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.h b/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.h
index 35125a74ff..1d745debae 100644
--- a/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.h
+++ b/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.h
@@ -13,125 +13,126 @@
 #include "mmdeploy/model.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

-typedef struct mmdeploy_rotated_detection_t {
-  int label_id;
-  float score;
-  float rbbox[5];  // cx, cy, w, h, angle
-} mmdeploy_rotated_detection_t;
-
-typedef struct mmdeploy_rotated_detector* mmdeploy_rotated_detector_t;
-
-/**
- * @brief Create rotated detector's handle
- * @param[in] model an instance of mmrotate sdk model created by
- * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h
- * @param[in] device_name name of device, such as "cpu", "cuda", etc.
- * @param[in] device_id id of device.
- * @param[out] detector instance of a rotated detector
- * @return status of creating rotated detector's handle
- */
-MMDEPLOY_API int mmdeploy_rotated_detector_create(mmdeploy_model_t model, const char* device_name,
-                                                  int device_id,
-                                                  mmdeploy_rotated_detector_t* detector);
-
-/**
- * @brief Create rotated detector's handle
- * @param[in] model_path path of mmrotate sdk model exported by mmdeploy model converter
- * @param[in] device_name name of device, such as "cpu", "cuda", etc.
- * @param[in] device_id id of device.
- * @param[out] detector instance of a rotated detector
- * @return status of creating rotated detector's handle
- */
-MMDEPLOY_API int mmdeploy_rotated_detector_create_by_path(const char* model_path,
-                                                          const char* device_name, int device_id,
-                                                          mmdeploy_rotated_detector_t* detector);
-
-/**
- * @brief Apply rotated detector to batch images and get their inference results
- * @param[in] detector rotated detector's handle created by \ref
- * mmdeploy_rotated_detector_create_by_path
- * @param[in] mats a batch of images
- * @param[in] mat_count number of images in the batch
- * @param[out] results a linear buffer to save detection results of each image. It must be released
- * by \ref mmdeploy_rotated_detector_release_result
- * @param[out] result_count a linear buffer with length being \p mat_count to save the number of
- * detection results of each image. And it must be released by \ref
- * mmdeploy_rotated_detector_release_result
- * @return status of inference
- */
-MMDEPLOY_API int mmdeploy_rotated_detector_apply(mmdeploy_rotated_detector_t detector,
-                                                 const mmdeploy_mat_t* mats, int mat_count,
-                                                 mmdeploy_rotated_detection_t** results,
-                                                 int** result_count);
-
-/** @brief Release the inference result buffer created by \ref mmdeploy_rotated_detector_apply
- * @param[in] results rotated detection results buffer
- * @param[in] result_count \p results size buffer
- */
-MMDEPLOY_API void mmdeploy_rotated_detector_release_result(mmdeploy_rotated_detection_t* results,
-                                                           const int* result_count);
-
-/**
- * @brief Destroy rotated detector's handle
- * @param[in] detector rotated detector's handle created by \ref
- * mmdeploy_rotated_detector_create_by_path or by \ref mmdeploy_rotated_detector_create
- */
-MMDEPLOY_API void mmdeploy_rotated_detector_destroy(mmdeploy_rotated_detector_t detector);
-
-/******************************************************************************
- * Experimental asynchronous APIs */
-
-/**
- * @brief Same as \ref mmdeploy_detector_create, but allows to control execution context of tasks
- * via context
- */
-MMDEPLOY_API int mmdeploy_rotated_detector_create_v2(mmdeploy_model_t model,
-                                                     mmdeploy_context_t context,
-                                                     mmdeploy_rotated_detector_t* detector);
-
-/**
- * @brief Pack rotated detector inputs into mmdeploy_value_t
- * @param[in] mats a batch of images
- * @param[in] mat_count number of images in the batch
- * @return the created value
- */
-MMDEPLOY_API int mmdeploy_rotated_detector_create_input(const mmdeploy_mat_t* mats, int mat_count,
-                                                        mmdeploy_value_t* input);
-
-/**
- * @brief Same as \ref mmdeploy_rotated_detector_apply, but input and output are packed in \ref
- * mmdeploy_value_t.
- */
-MMDEPLOY_API int mmdeploy_rotated_detector_apply_v2(mmdeploy_rotated_detector_t detector,
-                                                    mmdeploy_value_t input,
-                                                    mmdeploy_value_t* output);
-
-/**
- * @brief Apply rotated detector asynchronously
- * @param[in] detector handle to the detector
- * @param[in] input input sender
- * @return output sender
- */
-MMDEPLOY_API int mmdeploy_rotated_detector_apply_async(mmdeploy_rotated_detector_t detector,
-                                                       mmdeploy_sender_t input,
-                                                       mmdeploy_sender_t* output);
-
-/**
- * @brief Unpack rotated detector output from a mmdeploy_value_t
- * @param[in] output output obtained by applying a detector
- * @param[out] results a linear buffer to save detection results of each image. It must be released
- * by \ref mmdeploy_detector_release_result
- * @param[out] result_count a linear buffer with length number of input images to save the number of
- * detection results of each image. Must be released by \ref
- * mmdeploy_detector_release_result
- * @return status of the operation
- */
-MMDEPLOY_API int mmdeploy_rotated_detector_get_result(mmdeploy_value_t output,
-                                                      mmdeploy_rotated_detection_t** results,
-                                                      int** result_count);
+    typedef struct mmdeploy_rotated_detection_t
+    {
+        int label_id;
+        float score;
+        float rbbox[5];  // cx, cy, w, h, angle
+    } mmdeploy_rotated_detection_t;
+
+    typedef struct mmdeploy_rotated_detector* mmdeploy_rotated_detector_t;
+
+    /**
+     * @brief Create rotated detector's handle
+     * @param[in] model an instance of mmrotate sdk model created by
+     * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h
+     * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+     * @param[in] device_id id of device.
+     * @param[out] detector instance of a rotated detector
+     * @return status of creating rotated detector's handle
+     */
+    MMDEPLOY_API int mmdeploy_rotated_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_rotated_detector_t* detector);
+
+    /**
+     * @brief Create rotated detector's handle
+     * @param[in] model_path path of mmrotate sdk model exported by mmdeploy model converter
+     * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+     * @param[in] device_id id of device.
+     * @param[out] detector instance of a rotated detector
+     * @return status of creating rotated detector's handle
+     */
+    MMDEPLOY_API int mmdeploy_rotated_detector_create_by_path(const char* model_path,
+                                                              const char* device_name,
+                                                              int device_id,
+                                                              mmdeploy_rotated_detector_t* detector);
+
+    /**
+     * @brief Apply rotated detector to batch images and get their inference results
+     * @param[in] detector rotated detector's handle created by \ref
+     * mmdeploy_rotated_detector_create_by_path
+     * @param[in] mats a batch of images
+     * @param[in] mat_count number of images in the batch
+     * @param[out] results a linear buffer to save detection results of each image. It must be released
+     * by \ref mmdeploy_rotated_detector_release_result
+     * @param[out] result_count a linear buffer with length being \p mat_count to save the number of
+     * detection results of each image. It must be released by \ref
+     * mmdeploy_rotated_detector_release_result
+     * @return status of inference
+     */
+    MMDEPLOY_API int mmdeploy_rotated_detector_apply(mmdeploy_rotated_detector_t detector,
+                                                     const mmdeploy_mat_t* mats,
+                                                     int mat_count,
+                                                     mmdeploy_rotated_detection_t** results,
+                                                     int** result_count);
+
+    /** @brief Release the inference result buffer created by \ref mmdeploy_rotated_detector_apply
+     * @param[in] results rotated detection results buffer
+     * @param[in] result_count \p results size buffer
+     */
+    MMDEPLOY_API void mmdeploy_rotated_detector_release_result(mmdeploy_rotated_detection_t* results,
+                                                               const int* result_count);
+
+    /**
+     * @brief Destroy rotated detector's handle
+     * @param[in] detector rotated detector's handle created by \ref
+     * mmdeploy_rotated_detector_create_by_path or by \ref mmdeploy_rotated_detector_create
+     */
+    MMDEPLOY_API void mmdeploy_rotated_detector_destroy(mmdeploy_rotated_detector_t detector);
+
+    /******************************************************************************
+     * Experimental asynchronous APIs */
+
+    /**
+     * @brief Same as \ref mmdeploy_detector_create, but allows to control execution context of tasks
+     * via context
+     */
+    MMDEPLOY_API int mmdeploy_rotated_detector_create_v2(mmdeploy_model_t model,
+                                                         mmdeploy_context_t context,
+                                                         mmdeploy_rotated_detector_t* detector);
+
+    /**
+     * @brief Pack rotated detector inputs into mmdeploy_value_t
+     * @param[in] mats a batch of images
+     * @param[in] mat_count number of images in the batch
+     * @return status of the operation
+     */
+    MMDEPLOY_API int mmdeploy_rotated_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* input);
+
+    /**
+     * @brief Same as \ref mmdeploy_rotated_detector_apply, but input and output are packed in \ref
+     * mmdeploy_value_t.
+     */
+    MMDEPLOY_API int mmdeploy_rotated_detector_apply_v2(mmdeploy_rotated_detector_t detector,
+                                                        mmdeploy_value_t input,
+                                                        mmdeploy_value_t* output);
+
+    /**
+     * @brief Apply rotated detector asynchronously
+     * @param[in] detector handle to the detector
+     * @param[in] input input sender
+     * @return output sender
+     */
+    MMDEPLOY_API int mmdeploy_rotated_detector_apply_async(mmdeploy_rotated_detector_t detector,
+                                                           mmdeploy_sender_t input,
+                                                           mmdeploy_sender_t* output);
+
+    /**
+     * @brief Unpack rotated detector output from a mmdeploy_value_t
+     * @param[in] output output obtained by applying a detector
+     * @param[out] results a linear buffer to save detection results of each image. It must be released
+     * by \ref mmdeploy_rotated_detector_release_result
+     * @param[out] result_count a linear buffer with length equal to the number of input images to save the
+     * number of detection results of each image. Must be released by \ref
+     * mmdeploy_rotated_detector_release_result
+     * @return status of the operation
+     */
+    MMDEPLOY_API int mmdeploy_rotated_detector_get_result(mmdeploy_value_t output,
+                                                          mmdeploy_rotated_detection_t** results,
+                                                          int** result_count);

 #ifdef __cplusplus
 }
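(When several pipelines should share one execution context, the *_create_v2 path above replaces the device-name shortcut. A sketch under stated assumptions: mmdeploy_value_destroy is taken from common.h, and apply_v2 is assumed not to consume its input value.)

    mmdeploy_context_t ctx{};
    if (mmdeploy_context_create_by_device("cuda", 0, &ctx) == MMDEPLOY_SUCCESS)
    {
        mmdeploy_rotated_detector_t detector{};
        if (mmdeploy_rotated_detector_create_v2(model, ctx, &detector) == MMDEPLOY_SUCCESS)
        {
            mmdeploy_value_t input{};
            mmdeploy_value_t output{};
            if (mmdeploy_rotated_detector_create_input(mats, mat_count, &input) == MMDEPLOY_SUCCESS &&
                mmdeploy_rotated_detector_apply_v2(detector, input, &output) == MMDEPLOY_SUCCESS)
            {
                mmdeploy_rotated_detection_t* dets{};
                int* counts{};
                if (mmdeploy_rotated_detector_get_result(output, &dets, &counts) == MMDEPLOY_SUCCESS)
                {
                    mmdeploy_rotated_detector_release_result(dets, counts);
                }
            }
            mmdeploy_value_destroy(input);   // assumed cleanup helpers
            mmdeploy_value_destroy(output);
            mmdeploy_rotated_detector_destroy(detector);
        }
        mmdeploy_context_destroy(ctx);
    }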
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp b/csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp
index c982df39e5..9ec8ae366c 100644
--- a/csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp
+++ b/csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp
@@ -18,111 +18,128 @@
 using namespace mmdeploy;

 using ResultType = mmdeploy::Structure;

-int mmdeploy_segmentor_create(mmdeploy_model_t model, const char* device_name, int device_id,
-                              mmdeploy_segmentor_t* segmentor) {
-  mmdeploy_context_t context{};
-  auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
-  if (ec != MMDEPLOY_SUCCESS) {
+int mmdeploy_segmentor_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_segmentor_t* segmentor)
+{
+    mmdeploy_context_t context{};
+    auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
+    if (ec != MMDEPLOY_SUCCESS)
+    {
+        return ec;
+    }
+    ec = mmdeploy_segmentor_create_v2(model, context, segmentor);
+    mmdeploy_context_destroy(context);
     return ec;
-  }
-  ec = mmdeploy_segmentor_create_v2(model, context, segmentor);
-  mmdeploy_context_destroy(context);
-  return ec;
 }

-int mmdeploy_segmentor_create_by_path(const char* model_path, const char* device_name,
-                                      int device_id, mmdeploy_segmentor_t* segmentor) {
-  mmdeploy_model_t model{};
-  if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) {
+int mmdeploy_segmentor_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_segmentor_t* segmentor)
+{
+    mmdeploy_model_t model{};
+    if (auto ec = mmdeploy_model_create_by_path(model_path, &model))
+    {
+        return ec;
+    }
+    auto ec = mmdeploy_segmentor_create(model, device_name, device_id, segmentor);
+    mmdeploy_model_destroy(model);
     return ec;
-  }
-  auto ec = mmdeploy_segmentor_create(model, device_name, device_id, segmentor);
-  mmdeploy_model_destroy(model);
-  return ec;
 }

-int mmdeploy_segmentor_apply(mmdeploy_segmentor_t segmentor, const mmdeploy_mat_t* mats,
-                             int mat_count, mmdeploy_segmentation_t** results) {
-  wrapped<mmdeploy_value_t> input;
-  if (auto ec = mmdeploy_segmentor_create_input(mats, mat_count, input.ptr())) {
-    return ec;
-  }
-  wrapped<mmdeploy_value_t> output;
-  if (auto ec = mmdeploy_segmentor_apply_v2(segmentor, input, output.ptr())) {
-    return ec;
-  }
-  if (auto ec = mmdeploy_segmentor_get_result(output, results)) {
-    return ec;
-  }
-  return MMDEPLOY_SUCCESS;
+int mmdeploy_segmentor_apply(mmdeploy_segmentor_t segmentor, const mmdeploy_mat_t* mats, int mat_count, mmdeploy_segmentation_t** results)
+{
+    wrapped<mmdeploy_value_t> input;
+    if (auto ec = mmdeploy_segmentor_create_input(mats, mat_count, input.ptr()))
+    {
+        return ec;
+    }
+    wrapped<mmdeploy_value_t> output;
+    if (auto ec = mmdeploy_segmentor_apply_v2(segmentor, input, output.ptr()))
+    {
+        return ec;
+    }
+    if (auto ec = mmdeploy_segmentor_get_result(output, results))
+    {
+        return ec;
+    }
+    return MMDEPLOY_SUCCESS;
 }

-void mmdeploy_segmentor_release_result(mmdeploy_segmentation_t* results, int count) {
-  ResultType deleter(static_cast<size_t>(count), results);
+void mmdeploy_segmentor_release_result(mmdeploy_segmentation_t* results, int count)
+{
+    ResultType deleter(static_cast<size_t>(count), results);
 }

-void mmdeploy_segmentor_destroy(mmdeploy_segmentor_t segmentor) {
-  mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)segmentor);
+void mmdeploy_segmentor_destroy(mmdeploy_segmentor_t segmentor)
+{
+    mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)segmentor);
 }

-int mmdeploy_segmentor_create_v2(mmdeploy_model_t model, mmdeploy_context_t context,
-                                 mmdeploy_segmentor_t* segmentor) {
-  return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)segmentor);
+int mmdeploy_segmentor_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_segmentor_t* segmentor)
+{
+    return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)segmentor);
 }

-int mmdeploy_segmentor_create_input(const mmdeploy_mat_t* mats, int mat_count,
-                                    mmdeploy_value_t* value) {
-  return mmdeploy_common_create_input(mats, mat_count, value);
+int mmdeploy_segmentor_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value)
+{
+    return mmdeploy_common_create_input(mats, mat_count, value);
 }

-int mmdeploy_segmentor_apply_v2(mmdeploy_segmentor_t segmentor, mmdeploy_value_t input,
-                                mmdeploy_value_t* output) {
-  return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)segmentor, input, output);
+int mmdeploy_segmentor_apply_v2(mmdeploy_segmentor_t segmentor, mmdeploy_value_t input, mmdeploy_value_t* output)
+{
+    return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)segmentor, input, output);
 }

-int mmdeploy_segmentor_apply_async(mmdeploy_segmentor_t segmentor, mmdeploy_sender_t input,
-                                   mmdeploy_sender_t* output) {
-  return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)segmentor, input, output);
+int mmdeploy_segmentor_apply_async(mmdeploy_segmentor_t segmentor, mmdeploy_sender_t input, mmdeploy_sender_t* output)
+{
+    return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)segmentor, input, output);
 }

-int mmdeploy_segmentor_get_result(mmdeploy_value_t output, mmdeploy_segmentation_t** results) {
-  try {
-    const auto& value = Cast(output)->front();
-    size_t image_count = value.size();
-
-    ResultType r(image_count);
-    auto [results_data, buffers] = r.pointers();
-
-    auto results_ptr = results_data;
-
-    for (auto i = 0; i < image_count; ++i, ++results_ptr) {
-      auto& output_item = value[i];
-      MMDEPLOY_DEBUG("the {}-th item in output: {}", i, output_item);
-      auto segmentor_output = from_value<mmseg::SegmentorOutput>(output_item);
-      results_ptr->height = segmentor_output.height;
-      results_ptr->width = segmentor_output.width;
-      results_ptr->classes = segmentor_output.classes;
-      auto& mask = segmentor_output.mask;
-      auto& score = segmentor_output.score;
-      results_ptr->mask = nullptr;
-      results_ptr->score = nullptr;
-      if (mask.shape().size()) {
-        results_ptr->mask = mask.data<int>();
-        buffers[i] = mask.buffer();
-      } else {
-        results_ptr->score = score.data<float>();
-        buffers[i] = score.buffer();
-      }
+int mmdeploy_segmentor_get_result(mmdeploy_value_t output, mmdeploy_segmentation_t** results)
+{
+    try
+    {
+        const auto& value = Cast(output)->front();
+        size_t image_count = value.size();
+
+        ResultType r(image_count);
+        auto [results_data, buffers] = r.pointers();
+
+        auto results_ptr = results_data;
+
+        for (auto i = 0; i < image_count; ++i, ++results_ptr)
+        {
+            auto& output_item = value[i];
+            MMDEPLOY_DEBUG("the {}-th item in output: {}", i, output_item);
+            auto segmentor_output = from_value<mmseg::SegmentorOutput>(output_item);
+            results_ptr->height = segmentor_output.height;
+            results_ptr->width = segmentor_output.width;
+            results_ptr->classes = segmentor_output.classes;
+            auto& mask = segmentor_output.mask;
+            auto& score = segmentor_output.score;
+            results_ptr->mask = nullptr;
+            results_ptr->score = nullptr;
+            if (mask.shape().size())
+            {
+                results_ptr->mask = mask.data<int>();
+                buffers[i] = mask.buffer();
+            }
+            else
+            {
+                results_ptr->score = score.data<float>();
+                buffers[i] = score.buffer();
+            }
+        }
+
+        *results = results_data;
+        r.release();
+
+        return MMDEPLOY_SUCCESS;
     }
-
-    *results = results_data;
-    r.release();
-
-    return MMDEPLOY_SUCCESS;
-  } catch (const std::exception& e) {
-    MMDEPLOY_ERROR("exception caught: {}", e.what());
-  } catch (...) {
-    MMDEPLOY_ERROR("unknown exception caught");
-  }
-  return MMDEPLOY_E_FAIL;
+    catch (const std::exception& e)
+    {
+        MMDEPLOY_ERROR("exception caught: {}", e.what());
+    }
+    catch (...)
+    {
+        MMDEPLOY_ERROR("unknown exception caught");
+    }
+    return MMDEPLOY_E_FAIL;
 }
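(mmdeploy_segmentor_get_result above leaves exactly one of mask/score non-null per image, so callers branch on which output the model produced — illustrative only:)

    mmdeploy_segmentation_t* seg{};
    if (mmdeploy_segmentor_apply(segmentor, &image, 1, &seg) == MMDEPLOY_SUCCESS)
    {
        const int h = seg[0].height, w = seg[0].width;
        if (seg[0].mask)
        {
            // mask[i * w + j] is the label id of pixel (i, j)
            printf("label at (0,0): %d\n", seg[0].mask[0]);
        }
        else if (seg[0].score)
        {
            // score is CHW: score[k * h * w + i * w + j] scores class k at pixel (i, j)
            int best_k = 0;
            float best = seg[0].score[0];
            for (int k = 1; k < seg[0].classes; ++k)
            {
                if (seg[0].score[k * h * w] > best)
                {
                    best = seg[0].score[k * h * w];
                    best_k = k;
                }
            }
            printf("argmax class at (0,0): %d (%.3f)\n", best_k, best);
        }
        mmdeploy_segmentor_release_result(seg, 1);
    }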
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/segmentor.h b/csrc/mmdeploy/apis/c/mmdeploy/segmentor.h
index 65bcfd03f3..8d885a275b 100644
--- a/csrc/mmdeploy/apis/c/mmdeploy/segmentor.h
+++ b/csrc/mmdeploy/apis/c/mmdeploy/segmentor.h
@@ -13,91 +13,90 @@
 #include "mmdeploy/model.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

-typedef struct mmdeploy_segmentation_t {
-  int height;    ///< height of \p mask that equals to the input image's height
-  int width;     ///< width of \p mask that equals to the input image's width
-  int classes;   ///< the number of labels in \p mask
-  int* mask;     ///< segmentation mask of the input image, in which mask[i * width + j] indicates
-                 ///< the label id of pixel at (i, j), this field might be null
-  float* score;  ///< segmentation score map of the input image in CHW format, in which
-                 ///< score[height * width * k + i * width + j] indicates the score
-                 ///< of class k at pixel (i, j), this field might be null
-} mmdeploy_segmentation_t;
-
-typedef struct mmdeploy_segmentor* mmdeploy_segmentor_t;
-
-/**
- * @brief Create segmentor's handle
- * @param[in] model an instance of mmsegmentation sdk model created by
- * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h
- * @param[in] device_name name of device, such as "cpu", "cuda", etc.
- * @param[in] device_id id of device.
- * @param[out] segmentor instance of a segmentor, which must be destroyed
- * by \ref mmdeploy_segmentor_destroy
- * @return status of creating segmentor's handle
- */
-MMDEPLOY_API int mmdeploy_segmentor_create(mmdeploy_model_t model, const char* device_name,
-                                           int device_id, mmdeploy_segmentor_t* segmentor);
-
-/**
- * @brief Create segmentor's handle
- * @param[in] model_path path of mmsegmentation sdk model exported by mmdeploy model converter
- * @param[in] device_name name of device, such as "cpu", "cuda", etc.
- * @param[in] device_id id of device.
- * @param[out] segmentor instance of a segmentor, which must be destroyed
- * by \ref mmdeploy_segmentor_destroy
- * @return status of creating segmentor's handle
- */
-MMDEPLOY_API int mmdeploy_segmentor_create_by_path(const char* model_path, const char* device_name,
-                                                   int device_id, mmdeploy_segmentor_t* segmentor);
-
-/**
- * @brief Apply segmentor to batch images and get their inference results
- * @param[in] segmentor segmentor's handle created by \ref mmdeploy_segmentor_create_by_path or \ref
- * mmdeploy_segmentor_create
- * @param[in] mats a batch of images
- * @param[in] mat_count number of images in the batch
- * @param[out] results a linear buffer of length \p mat_count to save segmentation result of each
- * image. It must be released by \ref mmdeploy_segmentor_release_result
- * @return status of inference
- */
-MMDEPLOY_API int mmdeploy_segmentor_apply(mmdeploy_segmentor_t segmentor,
-                                          const mmdeploy_mat_t* mats, int mat_count,
-                                          mmdeploy_segmentation_t** results);
-
-/**
- * @brief Release result buffer returned by \ref mmdeploy_segmentor_apply
- * @param[in] results result buffer
- * @param[in] count length of \p results
- */
-MMDEPLOY_API void mmdeploy_segmentor_release_result(mmdeploy_segmentation_t* results, int count);
-
-/**
- * @brief Destroy segmentor's handle
- * @param[in] segmentor segmentor's handle created by \ref mmdeploy_segmentor_create_by_path
- */
-MMDEPLOY_API void mmdeploy_segmentor_destroy(mmdeploy_segmentor_t segmentor);
-
-/******************************************************************************
- * Experimental asynchronous APIs */
-
-MMDEPLOY_API int mmdeploy_segmentor_create_v2(mmdeploy_model_t model, mmdeploy_context_t context,
-                                              mmdeploy_segmentor_t* segmentor);
-
-MMDEPLOY_API int mmdeploy_segmentor_create_input(const mmdeploy_mat_t* mats, int mat_count,
-                                                 mmdeploy_value_t* value);
-
-MMDEPLOY_API int mmdeploy_segmentor_apply_v2(mmdeploy_segmentor_t segmentor, mmdeploy_value_t input,
-                                             mmdeploy_value_t* output);
-
-MMDEPLOY_API int mmdeploy_segmentor_apply_async(mmdeploy_segmentor_t segmentor,
-                                                mmdeploy_sender_t input, mmdeploy_sender_t* output);
-
-MMDEPLOY_API int mmdeploy_segmentor_get_result(mmdeploy_value_t output,
-                                               mmdeploy_segmentation_t** results);
+    typedef struct mmdeploy_segmentation_t
+    {
+        int height;    ///< height of \p mask that equals to the input image's height
+        int width;     ///< width of \p mask that equals to the input image's width
+        int classes;   ///< the number of labels in \p mask
+        int* mask;     ///< segmentation mask of the input image, in which mask[i * width + j] indicates
+                       ///< the label id of pixel at (i, j), this field might be null
+        float* score;  ///< segmentation score map of the input image in CHW format, in which
+                       ///< score[height * width * k + i * width + j] indicates the score
+                       ///< of class k at pixel (i, j), this field might be null
+    } mmdeploy_segmentation_t;
+
+    typedef struct mmdeploy_segmentor* mmdeploy_segmentor_t;
+
+    /**
+     * @brief Create segmentor's handle
+     * @param[in] model an instance of mmsegmentation sdk model created by
+     * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h
+     * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+     * @param[in] device_id id of device.
+     * @param[out] segmentor instance of a segmentor, which must be destroyed
+     * by \ref mmdeploy_segmentor_destroy
+     * @return status of creating segmentor's handle
+     */
+    MMDEPLOY_API int mmdeploy_segmentor_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_segmentor_t* segmentor);
+
+    /**
+     * @brief Create segmentor's handle
+     * @param[in] model_path path of mmsegmentation sdk model exported by mmdeploy model converter
+     * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+     * @param[in] device_id id of device.
+     * @param[out] segmentor instance of a segmentor, which must be destroyed
+     * by \ref mmdeploy_segmentor_destroy
+     * @return status of creating segmentor's handle
+     */
+    MMDEPLOY_API int mmdeploy_segmentor_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_segmentor_t* segmentor);
+
+    /**
+     * @brief Apply segmentor to batch images and get their inference results
+     * @param[in] segmentor segmentor's handle created by \ref mmdeploy_segmentor_create_by_path or \ref
+     * mmdeploy_segmentor_create
+     * @param[in] mats a batch of images
+     * @param[in] mat_count number of images in the batch
+     * @param[out] results a linear buffer of length \p mat_count to save the segmentation result of each
+     * image. It must be released by \ref mmdeploy_segmentor_release_result
+     * @return status of inference
+     */
+    MMDEPLOY_API int mmdeploy_segmentor_apply(mmdeploy_segmentor_t segmentor,
+                                              const mmdeploy_mat_t* mats,
+                                              int mat_count,
+                                              mmdeploy_segmentation_t** results);
+
+    /**
+     * @brief Release result buffer returned by \ref mmdeploy_segmentor_apply
+     * @param[in] results result buffer
+     * @param[in] count length of \p results
+     */
+    MMDEPLOY_API void mmdeploy_segmentor_release_result(mmdeploy_segmentation_t* results, int count);
+
+    /**
+     * @brief Destroy segmentor's handle
+     * @param[in] segmentor segmentor's handle created by \ref mmdeploy_segmentor_create_by_path
+     */
+    MMDEPLOY_API void mmdeploy_segmentor_destroy(mmdeploy_segmentor_t segmentor);
+
+    /******************************************************************************
+     * Experimental asynchronous APIs */
+
+    MMDEPLOY_API int mmdeploy_segmentor_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_segmentor_t* segmentor);
+
+    MMDEPLOY_API int mmdeploy_segmentor_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value);
+
+    MMDEPLOY_API int mmdeploy_segmentor_apply_v2(mmdeploy_segmentor_t segmentor, mmdeploy_value_t input, mmdeploy_value_t* output);
+
+    MMDEPLOY_API int mmdeploy_segmentor_apply_async(mmdeploy_segmentor_t segmentor,
+                                                    mmdeploy_sender_t input,
+                                                    mmdeploy_sender_t* output);
+
+    MMDEPLOY_API int mmdeploy_segmentor_get_result(mmdeploy_value_t output,
+                                                   mmdeploy_segmentation_t** results);

 #ifdef __cplusplus
 }
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/text_detector.cpp b/csrc/mmdeploy/apis/c/mmdeploy/text_detector.cpp
index 576af07762..44b124187f 100644
--- a/csrc/mmdeploy/apis/c/mmdeploy/text_detector.cpp
+++ b/csrc/mmdeploy/apis/c/mmdeploy/text_detector.cpp
@@ -16,158 +16,186 @@
 using namespace std;
 using namespace mmdeploy;

-int mmdeploy_text_detector_create(mmdeploy_model_t model, const char* device_name, int device_id,
-                                  mmdeploy_text_detector_t* detector) {
-  mmdeploy_context_t context{};
-  auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
-  if (ec != MMDEPLOY_SUCCESS) {
+int mmdeploy_text_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_text_detector_t* detector)
+{
+    mmdeploy_context_t context{};
+    auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
+    if (ec != MMDEPLOY_SUCCESS)
+    {
+        return ec;
+    }
+    ec = mmdeploy_text_detector_create_v2(model, context, detector);
+    mmdeploy_context_destroy(context);
     return ec;
-  }
-  ec = mmdeploy_text_detector_create_v2(model, context, detector);
-  mmdeploy_context_destroy(context);
-  return ec;
 }

-int mmdeploy_text_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context,
-                                     mmdeploy_text_detector_t* detector) {
-  return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector);
+int mmdeploy_text_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_text_detector_t* detector)
+{
+    return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector);
 }

-int mmdeploy_text_detector_create_by_path(const char* model_path, const char* device_name,
-                                          int device_id, mmdeploy_text_detector_t* detector) {
-  mmdeploy_model_t model{};
-  if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) {
+int mmdeploy_text_detector_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_text_detector_t* detector)
+{
+    mmdeploy_model_t model{};
+    if (auto ec = mmdeploy_model_create_by_path(model_path, &model))
+    {
+        return ec;
+    }
+    auto ec = mmdeploy_text_detector_create(model, device_name, device_id, detector);
+    mmdeploy_model_destroy(model);
    return ec;
-  }
-  auto ec = mmdeploy_text_detector_create(model, device_name, device_id, detector);
-  mmdeploy_model_destroy(model);
-  return ec;
 }

-int mmdeploy_text_detector_create_input(const mmdeploy_mat_t* mats, int mat_count,
-                                        mmdeploy_value_t* input) {
-  return mmdeploy_common_create_input(mats, mat_count, input);
+int mmdeploy_text_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* input)
+{
+    return mmdeploy_common_create_input(mats, mat_count, input);
 }

-int mmdeploy_text_detector_apply(mmdeploy_text_detector_t detector, const mmdeploy_mat_t* mats,
-                                 int mat_count, mmdeploy_text_detection_t** results,
-                                 int** result_count) {
-  wrapped<mmdeploy_value_t> input;
-  if (auto ec = mmdeploy_text_detector_create_input(mats, mat_count, input.ptr())) {
-    return ec;
-  }
-  wrapped<mmdeploy_value_t> output;
-  if (auto ec = mmdeploy_text_detector_apply_v2(detector, input, output.ptr())) {
-    return ec;
-  }
-  if (auto ec = mmdeploy_text_detector_get_result(output, results, result_count)) {
-    return ec;
-  }
-  return MMDEPLOY_SUCCESS;
+int mmdeploy_text_detector_apply(mmdeploy_text_detector_t detector, const mmdeploy_mat_t* mats, int mat_count, mmdeploy_text_detection_t** results, int** result_count)
+{
+    wrapped<mmdeploy_value_t> input;
+    if (auto ec = mmdeploy_text_detector_create_input(mats, mat_count, input.ptr()))
+    {
+        return ec;
+    }
+    wrapped<mmdeploy_value_t> output;
+    if (auto ec = mmdeploy_text_detector_apply_v2(detector, input, output.ptr()))
+    {
+        return ec;
+    }
+    if (auto ec = mmdeploy_text_detector_get_result(output, results, result_count))
+    {
+        return ec;
+    }
+    return MMDEPLOY_SUCCESS;
 }

-int mmdeploy_text_detector_apply_v2(mmdeploy_text_detector_t detector, mmdeploy_value_t input,
-                                    mmdeploy_value_t* output) {
-  return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output);
+int mmdeploy_text_detector_apply_v2(mmdeploy_text_detector_t detector, mmdeploy_value_t input, mmdeploy_value_t* output)
+{
+    return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output);
 }

-int mmdeploy_text_detector_apply_async(mmdeploy_text_detector_t detector, mmdeploy_sender_t input,
-                                       mmdeploy_sender_t* output) {
-  return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output);
+int mmdeploy_text_detector_apply_async(mmdeploy_text_detector_t detector, mmdeploy_sender_t input, mmdeploy_sender_t* output)
+{
+    return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output);
 }

-int mmdeploy_text_detector_get_result(mmdeploy_value_t output, mmdeploy_text_detection_t** results,
-                                      int** result_count) {
-  if (!output || !results || !result_count) {
-    return MMDEPLOY_E_INVALID_ARG;
-  }
-  try {
-    Value& value = reinterpret_cast<Value*>(output)->front();
-    auto detector_outputs = from_value<vector<mmocr::TextDetections>>(value);
-
-    vector<int> _result_count;
-    _result_count.reserve(detector_outputs.size());
-    for (const auto& det_output : detector_outputs) {
-      _result_count.push_back((int)det_output.size());
+int mmdeploy_text_detector_get_result(mmdeploy_value_t output, mmdeploy_text_detection_t** results, int** result_count)
+{
+    if (!output || !results || !result_count)
+    {
+        return MMDEPLOY_E_INVALID_ARG;
     }
-
-    auto total = std::accumulate(_result_count.begin(), _result_count.end(), 0);
-
-    std::unique_ptr<int[]> result_count_data(new int[_result_count.size()]{});
-    std::copy(_result_count.begin(), _result_count.end(), result_count_data.get());
-
-    std::unique_ptr<mmdeploy_text_detection_t[]> result_data(
-        new mmdeploy_text_detection_t[total]{});
-    auto result_ptr = result_data.get();
-
-    for (const auto& det_output : detector_outputs) {
-      for (auto i = 0; i < det_output.size(); ++i, ++result_ptr) {
-        result_ptr->score = det_output[i].score;
-        auto& bbox = det_output[i].bbox;
-        for (auto j = 0; j < bbox.size(); j += 2) {
-          result_ptr->bbox[j / 2].x = bbox[j];
-          result_ptr->bbox[j / 2].y = bbox[j + 1];
+    try
+    {
+        Value& value = reinterpret_cast<Value*>(output)->front();
+        auto detector_outputs = from_value<vector<mmocr::TextDetections>>(value);
+
+        vector<int> _result_count;
+        _result_count.reserve(detector_outputs.size());
+        for (const auto& det_output : detector_outputs)
+        {
+            _result_count.push_back((int)det_output.size());
         }
-      }
-    }

-    *result_count = result_count_data.release();
-    *results = result_data.release();
+        auto total = std::accumulate(_result_count.begin(), _result_count.end(), 0);
+
+        std::unique_ptr<int[]> result_count_data(new int[_result_count.size()]{});
+        std::copy(_result_count.begin(), _result_count.end(), result_count_data.get());
+
+        std::unique_ptr<mmdeploy_text_detection_t[]> result_data(
+            new mmdeploy_text_detection_t[total]{});
+        auto result_ptr = result_data.get();
+
+        for (const auto& det_output : detector_outputs)
+        {
+            for (auto i = 0; i < det_output.size(); ++i, ++result_ptr)
+            {
+                result_ptr->score = det_output[i].score;
+                auto& bbox = det_output[i].bbox;
+                for (auto j = 0; j < bbox.size(); j += 2)
+                {
+                    result_ptr->bbox[j / 2].x = bbox[j];
+                    result_ptr->bbox[j / 2].y = bbox[j + 1];
+                }
+            }
+        }

-    return MMDEPLOY_SUCCESS;
+        *result_count = result_count_data.release();
+        *results = result_data.release();

-  } catch (const std::exception& e) {
-    MMDEPLOY_ERROR("unhandled exception: {}", e.what());
-  } catch (...) {
-    MMDEPLOY_ERROR("unknown exception caught");
-  }
-  return 0;
+        return MMDEPLOY_SUCCESS;
+    }
+    catch (const std::exception& e)
+    {
+        MMDEPLOY_ERROR("unhandled exception: {}", e.what());
+    }
+    catch (...)
+    {
+        MMDEPLOY_ERROR("unknown exception caught");
+    }
+    return 0;
 }

 void mmdeploy_text_detector_release_result(mmdeploy_text_detection_t* results,
-                                           const int* result_count, int count) {
-  delete[] results;
-  delete[] result_count;
+                                           const int* result_count,
+                                           int count)
+{
+    delete[] results;
+    delete[] result_count;
 }

-void mmdeploy_text_detector_destroy(mmdeploy_text_detector_t detector) {
-  mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector);
+void mmdeploy_text_detector_destroy(mmdeploy_text_detector_t detector)
+{
+    mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector);
 }

-int mmdeploy_text_detector_apply_async_v2(mmdeploy_text_detector_t detector,
-                                          const mmdeploy_mat_t* imgs, int img_count,
-                                          mmdeploy_text_detector_continue_t cont, void* context,
-                                          mmdeploy_sender_t* output) {
-  mmdeploy_sender_t result_sender{};
-  if (auto ec = mmdeploy_text_detector_apply_async_v3(detector, imgs, img_count, &result_sender)) {
-    return ec;
-  }
-  if (auto ec = mmdeploy_text_detector_continue_async(result_sender, cont, context, output)) {
-    return ec;
-  }
-  return MMDEPLOY_SUCCESS;
+int mmdeploy_text_detector_apply_async_v2(mmdeploy_text_detector_t detector,
+                                          const mmdeploy_mat_t* imgs,
+                                          int img_count,
+                                          mmdeploy_text_detector_continue_t cont,
+                                          void* context,
+                                          mmdeploy_sender_t* output)
+{
+    mmdeploy_sender_t result_sender{};
+    if (auto ec = mmdeploy_text_detector_apply_async_v3(detector, imgs, img_count, &result_sender))
+    {
+        return ec;
+    }
+    if (auto ec = mmdeploy_text_detector_continue_async(result_sender, cont, context, output))
+    {
+        return ec;
+    }
+    return MMDEPLOY_SUCCESS;
 }

 int mmdeploy_text_detector_apply_async_v3(mmdeploy_text_detector_t detector,
-                                          const mmdeploy_mat_t* imgs, int img_count,
-                                          mmdeploy_sender_t* output) {
-  wrapped<mmdeploy_value_t> input_val;
-  if (auto ec = mmdeploy_text_detector_create_input(imgs, img_count, input_val.ptr())) {
-    return ec;
-  }
-  mmdeploy_sender_t input_sndr = mmdeploy_executor_just(input_val);
-  if (auto ec = mmdeploy_text_detector_apply_async(detector, input_sndr, output)) {
-    return ec;
-  }
-  return MMDEPLOY_SUCCESS;
+                                          const mmdeploy_mat_t* imgs,
+                                          int img_count,
+                                          mmdeploy_sender_t* output)
+{
+    wrapped<mmdeploy_value_t> input_val;
+    if (auto ec = mmdeploy_text_detector_create_input(imgs, img_count, input_val.ptr()))
+    {
+        return ec;
+    }
+    mmdeploy_sender_t input_sndr = mmdeploy_executor_just(input_val);
+    if (auto ec = mmdeploy_text_detector_apply_async(detector, input_sndr, output))
+    {
+        return ec;
+    }
+    return MMDEPLOY_SUCCESS;
 }

-int mmdeploy_text_detector_continue_async(mmdeploy_sender_t input,
-                                          mmdeploy_text_detector_continue_t cont, void* context,
-                                          mmdeploy_sender_t* output) {
-  auto sender = Guard([&] {
-    return Take(
-        LetValue(Take(input), [fn = cont, context](Value& value) -> TypeErasedSender {
+int mmdeploy_text_detector_continue_async(mmdeploy_sender_t input,
+                                          mmdeploy_text_detector_continue_t cont,
+                                          void* context,
+                                          mmdeploy_sender_t* output)
+{
+    auto sender = Guard([&]
+                        { return Take(
+                              LetValue(Take(input), [fn = cont, context](Value& value) -> TypeErasedSender
+                                       {
           mmdeploy_text_detection_t* results{};
           int* result_count{};
           if (auto ec = mmdeploy_text_detector_get_result(Cast(&value), &results, &result_count)) {
@@ -178,12 +206,11 @@ int mmdeploy_text_detector_continue_async(mmdeploy_sender_t input,
           if (auto ec = fn(results, result_count, context, &output); ec || !output) {
             return Just(Value());
           }
-          return Take(output);
-        }));
-  });
-  if (sender) {
-    *output = sender;
-    return MMDEPLOY_SUCCESS;
-  }
-  return MMDEPLOY_E_FAIL;
+          return Take(output); })); });
+    if (sender)
+    {
+        *output = sender;
+        return MMDEPLOY_SUCCESS;
+    }
+    return MMDEPLOY_E_FAIL;
 }
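(The continuation API just reformatted threads a user callback into the sender chain: get_result hands the detections to the callback and, per the LetValue body above, returning non-zero or leaving *output empty makes the chain continue with an empty value. A schematic callback; the context payload is a hypothetical image-count pointer chosen for this sketch:)

    static int on_text_detections(mmdeploy_text_detection_t* results, int* result_count,
                                  void* context, mmdeploy_sender_t* output)
    {
        int img_count = *(int*)context;  // hypothetical: the caller passes the image count via context
        const mmdeploy_text_detection_t* p = results;
        for (int i = 0; i < img_count; ++i)
        {
            for (int j = 0; j < result_count[i]; ++j, ++p)
            {
                printf("image %d: score=%.3f first vertex=(%.1f, %.1f)\n",
                       i, p->score, p->bbox[0].x, p->bbox[0].y);
            }
        }
        // ownership of the buffers transferred to this callback, so release them here
        mmdeploy_text_detector_release_result(results, result_count, img_count);
        // leave *output empty: the LetValue body above then continues with Just(Value())
        return MMDEPLOY_SUCCESS;
    }

    // wired up via:
    //   mmdeploy_sender_t sender{};
    //   mmdeploy_text_detector_apply_async_v2(detector, imgs, img_count, on_text_detections, &img_count, &sender);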
It must be released by \ref mmdeploy_detector_release_result - * @return status of inference - */ -MMDEPLOY_API int mmdeploy_text_detector_apply(mmdeploy_text_detector_t detector, - const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_text_detection_t** results, - int** result_count); - -/** @brief Release the inference result buffer returned by \ref mmdeploy_text_detector_apply - * @param[in] results text detection result buffer - * @param[in] result_count \p results size buffer - * @param[in] count the length of buffer \p result_count - */ -MMDEPLOY_API void mmdeploy_text_detector_release_result(mmdeploy_text_detection_t* results, - const int* result_count, int count); - -/** - * @brief Destroy text-detector's handle - * @param[in] detector text-detector's handle created by \ref mmdeploy_text_detector_create_by_path - * or \ref mmdeploy_text_detector_create - */ -MMDEPLOY_API void mmdeploy_text_detector_destroy(mmdeploy_text_detector_t detector); - -/****************************************************************************** - * Experimental asynchronous APIs */ - -/** - * @brief Same as \ref mmdeploy_text_detector_create, but allows to control execution context of - * tasks via context - */ -MMDEPLOY_API int mmdeploy_text_detector_create_v2(mmdeploy_model_t model, - mmdeploy_context_t context, - mmdeploy_text_detector_t* detector); - -/** - * @brief Pack text-detector inputs into mmdeploy_value_t - * @param[in] mats a batch of images - * @param[in] mat_count number of images in the batch - * @return the created value - */ -MMDEPLOY_API int mmdeploy_text_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* input); - -/** - * @brief Same as \ref mmdeploy_text_detector_apply, but input and output are packed in \ref - * mmdeploy_value_t. - */ -MMDEPLOY_API int mmdeploy_text_detector_apply_v2(mmdeploy_text_detector_t detector, - mmdeploy_value_t input, mmdeploy_value_t* output); - -/** - * @brief Apply text-detector asynchronously - * @param[in] detector handle to the detector - * @param[in] input input sender that will be consumed by the operation - * @return output sender - */ -MMDEPLOY_API int mmdeploy_text_detector_apply_async(mmdeploy_text_detector_t detector, - mmdeploy_sender_t input, - mmdeploy_sender_t* output); - -/** - * @brief Unpack detector output from a mmdeploy_value_t - * @param[in] output output sender returned by applying a detector - * @param[out] results a linear buffer to save detection results of each image. It must be - * released by \ref mmdeploy_text_detector_release_result - * @param[out] result_count a linear buffer with length number of input images to save the - * number of detection results of each image. 
Must be released by \ref - * mmdeploy_text_detector_release_result - * @return status of the operation - */ -MMDEPLOY_API -int mmdeploy_text_detector_get_result(mmdeploy_value_t output, mmdeploy_text_detection_t** results, - int** result_count); - -typedef int (*mmdeploy_text_detector_continue_t)(mmdeploy_text_detection_t* results, - int* result_count, void* context, - mmdeploy_sender_t* output); - -// MMDEPLOY_API int mmdeploy_text_detector_apply_async_v2(mm_handle_t handle, const mm_mat_t* imgs, -// int img_count, -// mmdeploy_text_detector_continuation_t -// cont, void* context, mmdeploy_sender_t* -// output); - -MMDEPLOY_API int mmdeploy_text_detector_apply_async_v3(mmdeploy_text_detector_t detector, - const mmdeploy_mat_t* imgs, int img_count, - mmdeploy_sender_t* output); - -MMDEPLOY_API int mmdeploy_text_detector_continue_async(mmdeploy_sender_t input, - mmdeploy_text_detector_continue_t cont, - void* context, mmdeploy_sender_t* output); + typedef struct mmdeploy_text_detection_t + { + mmdeploy_point_t bbox[4]; ///< a text bounding box of which the vertex are in clock-wise + float score; + } mmdeploy_text_detection_t; + + typedef struct mmdeploy_text_detector* mmdeploy_text_detector_t; + + /** + * @brief Create text-detector's handle + * @param[in] model an instance of mmocr text detection model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. + * @param[out] detector instance of a text-detector, which must be destroyed + * by \ref mmdeploy_text_detector_destroy + * @return status of creating text-detector's handle + */ + MMDEPLOY_API int mmdeploy_text_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_text_detector_t* detector); + + /** + * @brief Create text-detector's handle + * @param[in] model_path path to text detection model + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device + * @param[out] detector instance of a text-detector, which must be destroyed + * by \ref mmdeploy_text_detector_destroy + * @return status of creating text-detector's handle + */ + MMDEPLOY_API int mmdeploy_text_detector_create_by_path(const char* model_path, + const char* device_name, + int device_id, + mmdeploy_text_detector_t* detector); + + /** + * @brief Apply text-detector to batch images and get their inference results + * @param[in] detector text-detector's handle created by \ref mmdeploy_text_detector_create_by_path + * @param[in] mats a batch of images + * @param[in] mat_count number of images in the batch + * @param[out] results a linear buffer to save text detection results of each + * image. It must be released by calling \ref mmdeploy_text_detector_release_result + * @param[out] result_count a linear buffer of length \p mat_count to save the number of detection + * results of each image. 
It must be released by \ref mmdeploy_detector_release_result + * @return status of inference + */ + MMDEPLOY_API int mmdeploy_text_detector_apply(mmdeploy_text_detector_t detector, + const mmdeploy_mat_t* mats, + int mat_count, + mmdeploy_text_detection_t** results, + int** result_count); + + /** @brief Release the inference result buffer returned by \ref mmdeploy_text_detector_apply + * @param[in] results text detection result buffer + * @param[in] result_count \p results size buffer + * @param[in] count the length of buffer \p result_count + */ + MMDEPLOY_API void mmdeploy_text_detector_release_result(mmdeploy_text_detection_t* results, + const int* result_count, + int count); + + /** + * @brief Destroy text-detector's handle + * @param[in] detector text-detector's handle created by \ref mmdeploy_text_detector_create_by_path + * or \ref mmdeploy_text_detector_create + */ + MMDEPLOY_API void mmdeploy_text_detector_destroy(mmdeploy_text_detector_t detector); + + /****************************************************************************** + * Experimental asynchronous APIs */ + + /** + * @brief Same as \ref mmdeploy_text_detector_create, but allows to control execution context of + * tasks via context + */ + MMDEPLOY_API int mmdeploy_text_detector_create_v2(mmdeploy_model_t model, + mmdeploy_context_t context, + mmdeploy_text_detector_t* detector); + + /** + * @brief Pack text-detector inputs into mmdeploy_value_t + * @param[in] mats a batch of images + * @param[in] mat_count number of images in the batch + * @return the created value + */ + MMDEPLOY_API int mmdeploy_text_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* input); + + /** + * @brief Same as \ref mmdeploy_text_detector_apply, but input and output are packed in \ref + * mmdeploy_value_t. + */ + MMDEPLOY_API int mmdeploy_text_detector_apply_v2(mmdeploy_text_detector_t detector, + mmdeploy_value_t input, + mmdeploy_value_t* output); + + /** + * @brief Apply text-detector asynchronously + * @param[in] detector handle to the detector + * @param[in] input input sender that will be consumed by the operation + * @return output sender + */ + MMDEPLOY_API int mmdeploy_text_detector_apply_async(mmdeploy_text_detector_t detector, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); + + /** + * @brief Unpack detector output from a mmdeploy_value_t + * @param[in] output output sender returned by applying a detector + * @param[out] results a linear buffer to save detection results of each image. It must be + * released by \ref mmdeploy_text_detector_release_result + * @param[out] result_count a linear buffer with length number of input images to save the + * number of detection results of each image. 
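A minimal synchronous walkthrough of the text-detector API documented above; "dbnet" and "cpu" are placeholders for a real model directory and device name:

#include <cstdio>
#include "mmdeploy/text_detector.h"

int run_text_detection(const mmdeploy_mat_t* imgs, int img_count)
{
    mmdeploy_text_detector_t detector{};
    int ec = mmdeploy_text_detector_create_by_path("dbnet", "cpu", 0, &detector);
    if (ec != MMDEPLOY_SUCCESS)
    {
        return ec;
    }
    mmdeploy_text_detection_t* dets{};
    int*                       det_count{};
    ec = mmdeploy_text_detector_apply(detector, imgs, img_count, &dets, &det_count);
    if (ec == MMDEPLOY_SUCCESS)
    {
        // `dets` is one linear buffer; det_count[i] entries belong to image i
        const mmdeploy_text_detection_t* p = dets;
        for (int i = 0; i < img_count; ++i)
        {
            for (int j = 0; j < det_count[i]; ++j, ++p)
            {
                std::printf("image %d box %d score %.3f\n", i, j, p->score);
            }
        }
        mmdeploy_text_detector_release_result(dets, det_count, img_count);
    }
    mmdeploy_text_detector_destroy(detector);
    return ec;
}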
Must be released by \ref + * mmdeploy_text_detector_release_result + * @return status of the operation + */ + MMDEPLOY_API + int mmdeploy_text_detector_get_result(mmdeploy_value_t output, mmdeploy_text_detection_t** results, int** result_count); + + typedef int (*mmdeploy_text_detector_continue_t)(mmdeploy_text_detection_t* results, + int* result_count, + void* context, + mmdeploy_sender_t* output); + + // MMDEPLOY_API int mmdeploy_text_detector_apply_async_v2(mm_handle_t handle, const mm_mat_t* imgs, + // int img_count, + // mmdeploy_text_detector_continuation_t + // cont, void* context, mmdeploy_sender_t* + // output); + + MMDEPLOY_API int mmdeploy_text_detector_apply_async_v3(mmdeploy_text_detector_t detector, + const mmdeploy_mat_t* imgs, + int img_count, + mmdeploy_sender_t* output); + + MMDEPLOY_API int mmdeploy_text_detector_continue_async(mmdeploy_sender_t input, + mmdeploy_text_detector_continue_t cont, + void* context, + mmdeploy_sender_t* output); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.cpp b/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.cpp index 3c8cfbb5c6..4c94666add 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.cpp @@ -19,10 +19,12 @@ using namespace mmdeploy; -namespace { +namespace +{ -Value config_template(const Model& model) { - // clang-format off + Value config_template(const Model& model) + { + // clang-format off return { {"type", "Pipeline"}, {"input", {"imgs", "bboxes"}}, @@ -44,194 +46,238 @@ Value config_template(const Model& model) { }, {"output", "texts"}, }; - // clang-format on -} + // clang-format on + } } // namespace -int mmdeploy_text_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_text_recognizer_t* recognizer) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_text_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_text_recognizer_t* recognizer) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_text_recognizer_create_v2(model, context, recognizer); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_text_recognizer_create_v2(model, context, recognizer); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_text_recognizer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_text_recognizer_t* recognizer) { - auto config = config_template(*Cast(model)); - return mmdeploy_pipeline_create_v3(Cast(&config), context, (mmdeploy_pipeline_t*)recognizer); +int mmdeploy_text_recognizer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_text_recognizer_t* recognizer) +{ + auto config = config_template(*Cast(model)); + return mmdeploy_pipeline_create_v3(Cast(&config), context, (mmdeploy_pipeline_t*)recognizer); } -int mmdeploy_text_recognizer_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_text_recognizer_t* recognizer) { - mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { +int mmdeploy_text_recognizer_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_text_recognizer_t* recognizer) +{ + mmdeploy_model_t model{}; + if (auto ec = 
mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = mmdeploy_text_recognizer_create(model, device_name, device_id, recognizer); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_text_recognizer_create(model, device_name, device_id, recognizer); - mmdeploy_model_destroy(model); - return ec; } -int mmdeploy_text_recognizer_apply(mmdeploy_text_recognizer_t recognizer, - const mmdeploy_mat_t* images, int count, - mmdeploy_text_recognition_t** results) { - return mmdeploy_text_recognizer_apply_bbox(recognizer, images, count, nullptr, nullptr, results); +int mmdeploy_text_recognizer_apply(mmdeploy_text_recognizer_t recognizer, + const mmdeploy_mat_t* images, + int count, + mmdeploy_text_recognition_t** results) +{ + return mmdeploy_text_recognizer_apply_bbox(recognizer, images, count, nullptr, nullptr, results); } -int mmdeploy_text_recognizer_create_input(const mmdeploy_mat_t* images, int image_count, - const mmdeploy_text_detection_t* bboxes, - const int* bbox_count, mmdeploy_value_t* output) { - if (image_count && images == nullptr) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - Value::Array input_images; - Value::Array input_bboxes; - - auto add_bbox = [&](Mat img, const mmdeploy_text_detection_t* det) { - if (det) { - const auto& b = det->bbox; - Value::Array bbox{b[0].x, b[0].y, b[1].x, b[1].y, b[2].x, b[2].y, b[3].x, b[3].y}; - input_bboxes.push_back({{"bbox", std::move(bbox)}}); - } else { - input_bboxes.push_back(nullptr); - } - input_images.push_back({{"ori_img", img}}); - }; - - for (int i = 0; i < image_count; ++i) { - auto _mat = Cast(images[i]); - if (bboxes && bbox_count) { - for (int j = 0; j < bbox_count[i]; ++j) { - add_bbox(_mat, bboxes++); - } - } else { // inference with whole image - add_bbox(_mat, nullptr); - } +int mmdeploy_text_recognizer_create_input(const mmdeploy_mat_t* images, int image_count, const mmdeploy_text_detection_t* bboxes, const int* bbox_count, mmdeploy_value_t* output) +{ + if (image_count && images == nullptr) + { + return MMDEPLOY_E_INVALID_ARG; } + try + { + Value::Array input_images; + Value::Array input_bboxes; - *output = Take(Value{std::move(input_images), std::move(input_bboxes)}); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("exception caught: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + auto add_bbox = [&](Mat img, const mmdeploy_text_detection_t* det) + { + if (det) + { + const auto& b = det->bbox; + Value::Array bbox{b[0].x, b[0].y, b[1].x, b[1].y, b[2].x, b[2].y, b[3].x, b[3].y}; + input_bboxes.push_back({{"bbox", std::move(bbox)}}); + } + else + { + input_bboxes.push_back(nullptr); + } + input_images.push_back({{"ori_img", img}}); + }; + + for (int i = 0; i < image_count; ++i) + { + auto _mat = Cast(images[i]); + if (bboxes && bbox_count) + { + for (int j = 0; j < bbox_count[i]; ++j) + { + add_bbox(_mat, bboxes++); + } + } + else + { // inference with whole image + add_bbox(_mat, nullptr); + } + } + + *output = Take(Value{std::move(input_images), std::move(input_bboxes)}); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("exception caught: {}", e.what()); + } + catch (...) 
+ { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -int mmdeploy_text_recognizer_apply_bbox(mmdeploy_text_recognizer_t recognizer, - const mmdeploy_mat_t* images, int image_count, +int mmdeploy_text_recognizer_apply_bbox(mmdeploy_text_recognizer_t recognizer, + const mmdeploy_mat_t* images, + int image_count, const mmdeploy_text_detection_t* bboxes, - const int* bbox_count, - mmdeploy_text_recognition_t** results) { - wrapped input; - if (auto ec = mmdeploy_text_recognizer_create_input(images, image_count, bboxes, bbox_count, - input.ptr())) { - return ec; - } - wrapped output; - if (auto ec = mmdeploy_text_recognizer_apply_v2(recognizer, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_text_recognizer_get_result(output, results)) { - return ec; - } - return MMDEPLOY_SUCCESS; + const int* bbox_count, + mmdeploy_text_recognition_t** results) +{ + wrapped input; + if (auto ec = mmdeploy_text_recognizer_create_input(images, image_count, bboxes, bbox_count, input.ptr())) + { + return ec; + } + wrapped output; + if (auto ec = mmdeploy_text_recognizer_apply_v2(recognizer, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_text_recognizer_get_result(output, results)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } -int mmdeploy_text_recognizer_apply_v2(mmdeploy_text_recognizer_t recognizer, mmdeploy_value_t input, - mmdeploy_value_t* output) { - return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)recognizer, input, output); +int mmdeploy_text_recognizer_apply_v2(mmdeploy_text_recognizer_t recognizer, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)recognizer, input, output); } int mmdeploy_text_recognizer_apply_async(mmdeploy_text_recognizer_t recognizer, - mmdeploy_sender_t input, mmdeploy_sender_t* output) { - return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)recognizer, input, output); + mmdeploy_sender_t input, + mmdeploy_sender_t* output) +{ + return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)recognizer, input, output); } -MMDEPLOY_API int mmdeploy_text_recognizer_get_result(mmdeploy_value_t output, - mmdeploy_text_recognition_t** results) { - if (!output || !results) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - std::vector recognitions; - from_value(Cast(output)->front(), recognitions); +MMDEPLOY_API int mmdeploy_text_recognizer_get_result(mmdeploy_value_t output, + mmdeploy_text_recognition_t** results) +{ + if (!output || !results) + { + return MMDEPLOY_E_INVALID_ARG; + } + try + { + std::vector recognitions; + from_value(Cast(output)->front(), recognitions); - size_t count = recognitions.size(); + size_t count = recognitions.size(); - auto deleter = [&](mmdeploy_text_recognition_t* p) { - mmdeploy_text_recognizer_release_result(p, static_cast(count)); - }; + auto deleter = [&](mmdeploy_text_recognition_t* p) + { + mmdeploy_text_recognizer_release_result(p, static_cast(count)); + }; - std::unique_ptr _results( - new mmdeploy_text_recognition_t[count]{}, deleter); + std::unique_ptr _results( + new mmdeploy_text_recognition_t[count]{}, + deleter); - size_t result_idx = 0; - for (const auto& bbox_result : recognitions) { - auto& res = _results[result_idx++]; + size_t result_idx = 0; + for (const auto& bbox_result : recognitions) + { + auto& res = _results[result_idx++]; - auto& score = bbox_result.score; - res.length = static_cast(score.size()); + auto& score = bbox_result.score; + res.length = static_cast(score.size()); - res.score = new 
float[score.size()]; - std::copy_n(score.data(), score.size(), res.score); + res.score = new float[score.size()]; + std::copy_n(score.data(), score.size(), res.score); - auto text = bbox_result.text; - res.text = new char[text.length() + 1]; - std::copy_n(text.data(), text.length() + 1, res.text); - } + auto text = bbox_result.text; + res.text = new char[text.length() + 1]; + std::copy_n(text.data(), text.length() + 1, res.text); + } - *results = _results.release(); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("exception caught: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_SUCCESS; + *results = _results.release(); + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("exception caught: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_SUCCESS; } -void mmdeploy_text_recognizer_release_result(mmdeploy_text_recognition_t* results, int count) { - for (int i = 0; i < count; ++i) { - delete[] results[i].score; - delete[] results[i].text; - } - delete[] results; +void mmdeploy_text_recognizer_release_result(mmdeploy_text_recognition_t* results, int count) +{ + for (int i = 0; i < count; ++i) + { + delete[] results[i].score; + delete[] results[i].text; + } + delete[] results; } -void mmdeploy_text_recognizer_destroy(mmdeploy_text_recognizer_t recognizer) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)recognizer); +void mmdeploy_text_recognizer_destroy(mmdeploy_text_recognizer_t recognizer) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)recognizer); } -int mmdeploy_text_recognizer_apply_async_v3(mmdeploy_text_recognizer_t recognizer, - const mmdeploy_mat_t* imgs, int img_count, +int mmdeploy_text_recognizer_apply_async_v3(mmdeploy_text_recognizer_t recognizer, + const mmdeploy_mat_t* imgs, + int img_count, const mmdeploy_text_detection_t* bboxes, - const int* bbox_count, mmdeploy_sender_t* output) { - wrapped input_val; - if (auto ec = mmdeploy_text_recognizer_create_input(imgs, img_count, bboxes, bbox_count, - input_val.ptr())) { - return ec; - } - mmdeploy_sender_t input_sndr = mmdeploy_executor_just(input_val); - if (auto ec = mmdeploy_text_recognizer_apply_async(recognizer, input_sndr, output)) { - return ec; - } - return MMDEPLOY_SUCCESS; + const int* bbox_count, + mmdeploy_sender_t* output) +{ + wrapped input_val; + if (auto ec = mmdeploy_text_recognizer_create_input(imgs, img_count, bboxes, bbox_count, input_val.ptr())) + { + return ec; + } + mmdeploy_sender_t input_sndr = mmdeploy_executor_just(input_val); + if (auto ec = mmdeploy_text_recognizer_apply_async(recognizer, input_sndr, output)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } -int mmdeploy_text_recognizer_continue_async(mmdeploy_sender_t input, - mmdeploy_text_recognizer_continue_t cont, void* context, - mmdeploy_sender_t* output) { - auto sender = Guard([&] { - return Take( - LetValue(Take(input), [fn = cont, context](Value& value) -> TypeErasedSender { +int mmdeploy_text_recognizer_continue_async(mmdeploy_sender_t input, + mmdeploy_text_recognizer_continue_t cont, + void* context, + mmdeploy_sender_t* output) +{ + auto sender = Guard([&] + { return Take( + LetValue(Take(input), [fn = cont, context](Value& value) -> TypeErasedSender + { mmdeploy_text_recognition_t* results{}; if (auto ec = mmdeploy_text_recognizer_get_result(Cast(&value), &results)) { return Just(Value()); @@ -241,12 +287,11 @@ int mmdeploy_text_recognizer_continue_async(mmdeploy_sender_t input, if (auto ec = fn(results, 
context, &output); ec || !output) { return Just(Value()); } - return Take(output); - })); - }); - if (sender) { - *output = sender; - return MMDEPLOY_SUCCESS; - } - return MMDEPLOY_E_FAIL; + return Take(output); })); }); + if (sender) + { + *output = sender; + return MMDEPLOY_SUCCESS; + } + return MMDEPLOY_E_FAIL; } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.h b/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.h index 6c18928242..f20c878028 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.h @@ -13,149 +13,155 @@ #include "mmdeploy/text_detector.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_text_recognition_t { - char* text; - float* score; - int length; -} mmdeploy_text_recognition_t; - -typedef struct mmdeploy_text_recognizer* mmdeploy_text_recognizer_t; - -/** - * @brief Create a text recognizer instance - * @param[in] model an instance of mmocr text recognition model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] recognizer handle of the created text recognizer, which must be destroyed - * by \ref mmdeploy_text_recognizer_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_text_recognizer_create(mmdeploy_model_t model, const char* device_name, - int device_id, - mmdeploy_text_recognizer_t* recognizer); - -/** - * @brief Create a text recognizer instance - * @param[in] model_path path to text recognition model - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] recognizer handle of the created text recognizer, which must be destroyed - * by \ref mmdeploy_text_recognizer_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_text_recognizer_create_by_path(const char* model_path, - const char* device_name, int device_id, - mmdeploy_text_recognizer_t* recognizer); - -/** - * @brief Apply text recognizer to a batch of text images - * @param[in] recognizer text recognizer's handle created by \ref - * mmdeploy_text_recognizer_create_by_path - * @param[in] images a batch of text images - * @param[in] count number of images in the batch - * @param[out] results a linear buffer contains the recognized text, must be release - * by \ref mmdeploy_text_recognizer_release_result - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_text_recognizer_apply(mmdeploy_text_recognizer_t recognizer, - const mmdeploy_mat_t* images, int count, - mmdeploy_text_recognition_t** results); - -/** - * @brief Apply text recognizer to a batch of images supplied with text bboxes - * @param[in] recognizer text recognizer's handle created by \ref - * mmdeploy_text_recognizer_create_by_path - * @param[in] images a batch of text images - * @param[in] image_count number of images in the batch - * @param[in] bboxes bounding boxes detected by text detector - * @param[in] bbox_count number of bboxes of each \p images, must be same length as \p images - * @param[out] results a linear buffer contains the recognized text, which has the same length as \p - * bboxes, must be release by \ref mmdeploy_text_recognizer_release_result - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_text_recognizer_apply_bbox(mmdeploy_text_recognizer_t recognizer, - const mmdeploy_mat_t* images, int 
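A sketch of the experimental async path for the recognizer, mirroring apply_async_v3 above. mmdeploy_executor_sync_wait and mmdeploy_value_destroy are assumed from executor.h/common.h; neither appears in this hunk:

#include <numeric>
#include "mmdeploy/text_recognizer.h"

int recognize_async(mmdeploy_text_recognizer_t       recognizer,
                    const mmdeploy_mat_t*            imgs,
                    int                              img_count,
                    const mmdeploy_text_detection_t* bboxes,
                    const int*                       bbox_count)
{
    mmdeploy_sender_t sender{};
    if (int ec = mmdeploy_text_recognizer_apply_async_v3(recognizer, imgs, img_count, bboxes, bbox_count, &sender))
    {
        return ec;
    }
    // mmdeploy_executor_sync_wait blocks until the sender completes and
    // yields the packed output; assumed API from executor.h, not this hunk
    mmdeploy_value_t output = mmdeploy_executor_sync_wait(sender);
    mmdeploy_text_recognition_t* texts{};
    int ec = mmdeploy_text_recognizer_get_result(output, &texts);
    mmdeploy_value_destroy(output);  // assumed from common.h
    if (ec == MMDEPLOY_SUCCESS)
    {
        // one recognition per input bbox
        int total = std::accumulate(bbox_count, bbox_count + img_count, 0);
        mmdeploy_text_recognizer_release_result(texts, total);
    }
    return ec;
}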
image_count, - const mmdeploy_text_detection_t* bboxes, - const int* bbox_count, - mmdeploy_text_recognition_t** results); - -/** @brief Release result buffer returned by \ref mmdeploy_text_recognizer_apply or \ref - * mmdeploy_text_recognizer_apply_bbox - * @param[in] results result buffer by text recognizer - * @param[in] count length of \p result - */ -MMDEPLOY_API void mmdeploy_text_recognizer_release_result(mmdeploy_text_recognition_t* results, - int count); - -/** - * @brief destroy text recognizer - * @param[in] recognizer handle of text recognizer created by \ref - * mmdeploy_text_recognizer_create_by_path or \ref mmdeploy_text_recognizer_create - */ -MMDEPLOY_API void mmdeploy_text_recognizer_destroy(mmdeploy_text_recognizer_t recognizer); - -/****************************************************************************** - * Experimental asynchronous APIs */ - -/** - * @brief Same as \ref mmdeploy_text_recognizer_create, but allows to control execution context of - * tasks via context - */ -MMDEPLOY_API int mmdeploy_text_recognizer_create_v2(mmdeploy_model_t model, - mmdeploy_context_t context, - mmdeploy_text_recognizer_t* recognizer); - -/** - * @brief Pack text-recognizer inputs into mmdeploy_value_t - * @param[in] images a batch of images - * @param[in] image_count number of images in the batch - * @param[in] bboxes bounding boxes detected by text detector - * @param[in] bbox_count number of bboxes of each \p images, must be same length as \p images - * @return value created - */ -MMDEPLOY_API int mmdeploy_text_recognizer_create_input(const mmdeploy_mat_t* images, - int image_count, - const mmdeploy_text_detection_t* bboxes, - const int* bbox_count, - mmdeploy_value_t* output); - -MMDEPLOY_API int mmdeploy_text_recognizer_apply_v2(mmdeploy_text_recognizer_t recognizer, - mmdeploy_value_t input, - mmdeploy_value_t* output); - -/** - * @brief Same as \ref mmdeploy_text_recognizer_apply_bbox, but input and output are packed in \ref - * mmdeploy_value_t. - */ -MMDEPLOY_API int mmdeploy_text_recognizer_apply_async(mmdeploy_text_recognizer_t recognizer, - mmdeploy_sender_t input, - mmdeploy_sender_t* output); - -typedef int (*mmdeploy_text_recognizer_continue_t)(mmdeploy_text_recognition_t* results, - void* context, mmdeploy_sender_t* output); - -MMDEPLOY_API int mmdeploy_text_recognizer_apply_async_v3(mmdeploy_text_recognizer_t recognizer, - const mmdeploy_mat_t* imgs, int img_count, - const mmdeploy_text_detection_t* bboxes, - const int* bbox_count, - mmdeploy_sender_t* output); - -MMDEPLOY_API int mmdeploy_text_recognizer_continue_async(mmdeploy_sender_t input, - mmdeploy_text_recognizer_continue_t cont, - void* context, mmdeploy_sender_t* output); - -/** - * @brief Unpack text-recognizer output from a mmdeploy_value_t - * @param[in] output - * @param[out] results - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_text_recognizer_get_result(mmdeploy_value_t output, - mmdeploy_text_recognition_t** results); + typedef struct mmdeploy_text_recognition_t + { + char* text; + float* score; + int length; + } mmdeploy_text_recognition_t; + + typedef struct mmdeploy_text_recognizer* mmdeploy_text_recognizer_t; + + /** + * @brief Create a text recognizer instance + * @param[in] model an instance of mmocr text recognition model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. 
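The two modules chain directly: detector output feeds mmdeploy_text_recognizer_apply_bbox one recognition per box. A sketch with placeholder model directories; passing nullptr for bboxes and bbox_count instead recognizes each whole image, per the create_input fallback shown earlier:

#include <cstdio>
#include <numeric>
#include "mmdeploy/text_detector.h"
#include "mmdeploy/text_recognizer.h"

int run_ocr(const mmdeploy_mat_t* imgs, int img_count)
{
    mmdeploy_text_detector_t detector{};
    if (int ec = mmdeploy_text_detector_create_by_path("dbnet", "cpu", 0, &detector))
    {
        return ec;
    }
    mmdeploy_text_recognizer_t recognizer{};
    if (int ec = mmdeploy_text_recognizer_create_by_path("crnn", "cpu", 0, &recognizer))
    {
        mmdeploy_text_detector_destroy(detector);
        return ec;
    }
    mmdeploy_text_detection_t* boxes{};
    int*                       box_count{};
    int ec = mmdeploy_text_detector_apply(detector, imgs, img_count, &boxes, &box_count);
    if (ec == MMDEPLOY_SUCCESS)
    {
        mmdeploy_text_recognition_t* texts{};
        ec = mmdeploy_text_recognizer_apply_bbox(recognizer, imgs, img_count, boxes, box_count, &texts);
        if (ec == MMDEPLOY_SUCCESS)
        {
            int total = std::accumulate(box_count, box_count + img_count, 0);
            for (int i = 0; i < total; ++i)
            {
                std::printf("%s\n", texts[i].text);  // per-char scores in texts[i].score[0..length)
            }
            mmdeploy_text_recognizer_release_result(texts, total);
        }
        mmdeploy_text_detector_release_result(boxes, box_count, img_count);
    }
    mmdeploy_text_recognizer_destroy(recognizer);
    mmdeploy_text_detector_destroy(detector);
    return ec;
}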
+ * @param[out] recognizer handle of the created text recognizer, which must be destroyed + * by \ref mmdeploy_text_recognizer_destroy + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_text_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_text_recognizer_t* recognizer); + + /** + * @brief Create a text recognizer instance + * @param[in] model_path path to text recognition model + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. + * @param[out] recognizer handle of the created text recognizer, which must be destroyed + * by \ref mmdeploy_text_recognizer_destroy + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_text_recognizer_create_by_path(const char* model_path, + const char* device_name, + int device_id, + mmdeploy_text_recognizer_t* recognizer); + + /** + * @brief Apply text recognizer to a batch of text images + * @param[in] recognizer text recognizer's handle created by \ref + * mmdeploy_text_recognizer_create_by_path + * @param[in] images a batch of text images + * @param[in] count number of images in the batch + * @param[out] results a linear buffer contains the recognized text, must be release + * by \ref mmdeploy_text_recognizer_release_result + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_text_recognizer_apply(mmdeploy_text_recognizer_t recognizer, + const mmdeploy_mat_t* images, + int count, + mmdeploy_text_recognition_t** results); + + /** + * @brief Apply text recognizer to a batch of images supplied with text bboxes + * @param[in] recognizer text recognizer's handle created by \ref + * mmdeploy_text_recognizer_create_by_path + * @param[in] images a batch of text images + * @param[in] image_count number of images in the batch + * @param[in] bboxes bounding boxes detected by text detector + * @param[in] bbox_count number of bboxes of each \p images, must be same length as \p images + * @param[out] results a linear buffer contains the recognized text, which has the same length as \p + * bboxes, must be release by \ref mmdeploy_text_recognizer_release_result + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_text_recognizer_apply_bbox(mmdeploy_text_recognizer_t recognizer, + const mmdeploy_mat_t* images, + int image_count, + const mmdeploy_text_detection_t* bboxes, + const int* bbox_count, + mmdeploy_text_recognition_t** results); + + /** @brief Release result buffer returned by \ref mmdeploy_text_recognizer_apply or \ref + * mmdeploy_text_recognizer_apply_bbox + * @param[in] results result buffer by text recognizer + * @param[in] count length of \p result + */ + MMDEPLOY_API void mmdeploy_text_recognizer_release_result(mmdeploy_text_recognition_t* results, + int count); + + /** + * @brief destroy text recognizer + * @param[in] recognizer handle of text recognizer created by \ref + * mmdeploy_text_recognizer_create_by_path or \ref mmdeploy_text_recognizer_create + */ + MMDEPLOY_API void mmdeploy_text_recognizer_destroy(mmdeploy_text_recognizer_t recognizer); + + /****************************************************************************** + * Experimental asynchronous APIs */ + + /** + * @brief Same as \ref mmdeploy_text_recognizer_create, but allows to control execution context of + * tasks via context + */ + MMDEPLOY_API int mmdeploy_text_recognizer_create_v2(mmdeploy_model_t model, + mmdeploy_context_t context, + mmdeploy_text_recognizer_t* recognizer); + + /** + * @brief Pack 
text-recognizer inputs into mmdeploy_value_t + * @param[in] images a batch of images + * @param[in] image_count number of images in the batch + * @param[in] bboxes bounding boxes detected by text detector + * @param[in] bbox_count number of bboxes of each \p images, must be same length as \p images + * @return value created + */ + MMDEPLOY_API int mmdeploy_text_recognizer_create_input(const mmdeploy_mat_t* images, + int image_count, + const mmdeploy_text_detection_t* bboxes, + const int* bbox_count, + mmdeploy_value_t* output); + + MMDEPLOY_API int mmdeploy_text_recognizer_apply_v2(mmdeploy_text_recognizer_t recognizer, + mmdeploy_value_t input, + mmdeploy_value_t* output); + + /** + * @brief Same as \ref mmdeploy_text_recognizer_apply_bbox, but input and output are packed in \ref + * mmdeploy_value_t. + */ + MMDEPLOY_API int mmdeploy_text_recognizer_apply_async(mmdeploy_text_recognizer_t recognizer, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); + + typedef int (*mmdeploy_text_recognizer_continue_t)(mmdeploy_text_recognition_t* results, + void* context, + mmdeploy_sender_t* output); + + MMDEPLOY_API int mmdeploy_text_recognizer_apply_async_v3(mmdeploy_text_recognizer_t recognizer, + const mmdeploy_mat_t* imgs, + int img_count, + const mmdeploy_text_detection_t* bboxes, + const int* bbox_count, + mmdeploy_sender_t* output); + + MMDEPLOY_API int mmdeploy_text_recognizer_continue_async(mmdeploy_sender_t input, + mmdeploy_text_recognizer_continue_t cont, + void* context, + mmdeploy_sender_t* output); + + /** + * @brief Unpack text-recognizer output from a mmdeploy_value_t + * @param[in] output + * @param[out] results + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_text_recognizer_get_result(mmdeploy_value_t output, + mmdeploy_text_recognition_t** results); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.cpp b/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.cpp index de71e57842..3f0ab3c305 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.cpp @@ -20,146 +20,178 @@ using namespace mmdeploy; -int mmdeploy_video_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_video_recognizer_t* recognizer) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_video_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_video_recognizer_t* recognizer) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_video_recognizer_create_v2(model, context, recognizer); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_video_recognizer_create_v2(model, context, recognizer); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_video_recognizer_create_by_path(const char* model_path, const char* device_name, - int device_id, - mmdeploy_video_recognizer_t* recognizer) { - mmdeploy_model_t model{}; +int mmdeploy_video_recognizer_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_video_recognizer_t* recognizer) +{ + mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { + if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = 
mmdeploy_video_recognizer_create(model, device_name, device_id, recognizer); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_video_recognizer_create(model, device_name, device_id, recognizer); - mmdeploy_model_destroy(model); - return ec; } -int mmdeploy_video_recognizer_apply(mmdeploy_video_recognizer_t recognizer, - const mmdeploy_mat_t* images, - const mmdeploy_video_sample_info_t* video_info, int video_count, - mmdeploy_video_recognition_t** results, int** result_count) { - wrapped input; - if (auto ec = - mmdeploy_video_recognizer_create_input(images, video_info, video_count, input.ptr())) { - return ec; - } +int mmdeploy_video_recognizer_apply(mmdeploy_video_recognizer_t recognizer, + const mmdeploy_mat_t* images, + const mmdeploy_video_sample_info_t* video_info, + int video_count, + mmdeploy_video_recognition_t** results, + int** result_count) +{ + wrapped input; + if (auto ec = + mmdeploy_video_recognizer_create_input(images, video_info, video_count, input.ptr())) + { + return ec; + } - wrapped output; - if (auto ec = mmdeploy_video_recognizer_apply_v2(recognizer, input, output.ptr())) { - return ec; - } + wrapped output; + if (auto ec = mmdeploy_video_recognizer_apply_v2(recognizer, input, output.ptr())) + { + return ec; + } - if (auto ec = mmdeploy_video_recognizer_get_result(output, results, result_count)) { - return ec; - } - return MMDEPLOY_SUCCESS; + if (auto ec = mmdeploy_video_recognizer_get_result(output, results, result_count)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } void mmdeploy_video_recognizer_release_result(mmdeploy_video_recognition_t* results, - int* result_count, int video_count) { - delete[] results; - delete[] result_count; + int* result_count, + int video_count) +{ + delete[] results; + delete[] result_count; } -void mmdeploy_video_recognizer_destroy(mmdeploy_video_recognizer_t recognizer) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)recognizer); +void mmdeploy_video_recognizer_destroy(mmdeploy_video_recognizer_t recognizer) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)recognizer); } -int mmdeploy_video_recognizer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_video_recognizer_t* recognizer) { - return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)recognizer); +int mmdeploy_video_recognizer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_video_recognizer_t* recognizer) +{ + return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)recognizer); } -int mmdeploy_video_recognizer_create_input(const mmdeploy_mat_t* images, +int mmdeploy_video_recognizer_create_input(const mmdeploy_mat_t* images, const mmdeploy_video_sample_info_t* video_info, - int video_count, mmdeploy_value_t* value) { - if (video_count && (images == nullptr || video_info == nullptr)) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - auto input = std::make_unique(Value{Value::kArray}); - auto sample = std::make_unique(Value::kArray); - for (int i = 0; i < video_count; ++i) { - int clip_len = video_info[i].clip_len; - int num_clips = video_info[i].num_clips; - int n_mat = clip_len * num_clips; - for (int j = 0; j < n_mat; j++) { - mmdeploy::Mat _mat{images[j].height, - images[j].width, - PixelFormat(images[j].format), - DataType(images[j].type), - images[j].data, - images[j].device ? 
*(const Device*)(images[j].device) : Device{0}}; - sample->push_back({{"ori_img", _mat}, {"clip_len", clip_len}, {"num_clips", num_clips}}); - } - input->front().push_back(std::move(*sample.release())); + int video_count, + mmdeploy_value_t* value) +{ + if (video_count && (images == nullptr || video_info == nullptr)) + { + return MMDEPLOY_E_INVALID_ARG; + } + try + { + auto input = std::make_unique(Value{Value::kArray}); + auto sample = std::make_unique(Value::kArray); + for (int i = 0; i < video_count; ++i) + { + int clip_len = video_info[i].clip_len; + int num_clips = video_info[i].num_clips; + int n_mat = clip_len * num_clips; + for (int j = 0; j < n_mat; j++) + { + mmdeploy::Mat _mat{images[j].height, + images[j].width, + PixelFormat(images[j].format), + DataType(images[j].type), + images[j].data, + images[j].device ? *(const Device*)(images[j].device) : Device{0}}; + sample->push_back({{"ori_img", _mat}, {"clip_len", clip_len}, {"num_clips", num_clips}}); + } + input->front().push_back(std::move(*sample.release())); + } + *value = Cast(input.release()); + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); } - *value = Cast(input.release()); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_SUCCESS; + return MMDEPLOY_SUCCESS; } int mmdeploy_video_recognizer_apply_v2(mmdeploy_video_recognizer_t recognizer, - mmdeploy_value_t input, mmdeploy_value_t* output) { - return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)recognizer, input, output); + mmdeploy_value_t input, + mmdeploy_value_t* output) +{ + return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)recognizer, input, output); } -int mmdeploy_video_recognizer_get_result(mmdeploy_value_t output, +int mmdeploy_video_recognizer_get_result(mmdeploy_value_t output, mmdeploy_video_recognition_t** results, - int** result_count) { - if (!output || !results || !result_count) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - Value& value = Cast(output)->front(); - - auto classify_outputs = from_value>(value); - - std::vector _result_count; - _result_count.reserve(classify_outputs.size()); - - for (const auto& cls_output : classify_outputs) { - _result_count.push_back((int)cls_output.size()); + int** result_count) +{ + if (!output || !results || !result_count) + { + return MMDEPLOY_E_INVALID_ARG; } - - auto total = std::accumulate(begin(_result_count), end(_result_count), 0); - - std::unique_ptr result_count_data(new int[_result_count.size()]{}); - std::copy(_result_count.begin(), _result_count.end(), result_count_data.get()); - - std::unique_ptr result_data( - new mmdeploy_video_recognition_t[total]{}); - auto result_ptr = result_data.get(); - for (const auto& cls_output : classify_outputs) { - for (const auto& label : cls_output) { - result_ptr->label_id = label.label_id; - result_ptr->score = label.score; - ++result_ptr; - } + try + { + Value& value = Cast(output)->front(); + + auto classify_outputs = from_value>(value); + + std::vector _result_count; + _result_count.reserve(classify_outputs.size()); + + for (const auto& cls_output : classify_outputs) + { + _result_count.push_back((int)cls_output.size()); + } + + auto total = std::accumulate(begin(_result_count), end(_result_count), 0); + + std::unique_ptr result_count_data(new int[_result_count.size()]{}); + std::copy(_result_count.begin(), 
_result_count.end(), result_count_data.get()); + + std::unique_ptr result_data( + new mmdeploy_video_recognition_t[total]{}); + auto result_ptr = result_data.get(); + for (const auto& cls_output : classify_outputs) + { + for (const auto& label : cls_output) + { + result_ptr->label_id = label.label_id; + result_ptr->score = label.score; + ++result_ptr; + } + } + + *result_count = result_count_data.release(); + *results = result_data.release(); + + return MMDEPLOY_SUCCESS; } - - *result_count = result_count_data.release(); - *results = result_data.release(); - - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.h b/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.h index e98b2bd07e..6893170e7d 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.h @@ -13,124 +13,129 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_video_recognition_t { - int label_id; - float score; -} mmdeploy_video_recognition_t; - -typedef struct mmdeploy_video_sample_info_t { - int clip_len; - int num_clips; -} mmdeploy_video_sample_info_t; - -typedef struct mmdeploy_video_recognizer* mmdeploy_video_recognizer_t; - -/** - * @brief Create video recognizer's handle - * @param[in] model an instance of mmaction sdk model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] recognizer handle of the created video recognizer, which must be destroyed - * by \ref mmdeploy_video_recognizer_destroy - * @return status of creating video recognizer's handle - */ -MMDEPLOY_API int mmdeploy_video_recognizer_create(mmdeploy_model_t model, const char* device_name, - int device_id, - mmdeploy_video_recognizer_t* recognizer); - -/** - * @brief Create a video recognizer instance - * @param[in] model_path path to video recognition model - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] recognizer handle of the created video recognizer, which must be destroyed - * by \ref mmdeploy_video_recognizer_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_video_recognizer_create_by_path(const char* model_path, - const char* device_name, int device_id, - mmdeploy_video_recognizer_t* recognizer); - -/** - * @brief Apply video recognizer to a batch of videos - * @param[in] recognizer video recognizer's handle created by \ref - * mmdeploy_video_recognizer_create_by_path - * @param[in] images a batch of videos - * @param[in] video_info video information of each video - * @param[in] video_count number of videos - * @param[out] results a linear buffer contains the recognized video, must be release - * by \ref mmdeploy_video_recognizer_release_result - * @param[out] result_count a linear buffer with length being \p video_count to save the number of - * recognition results of each video. 
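A single-video sketch of the recognizer API above. Per the create_input loop, the frame buffer must hold clip_len * num_clips consecutive mats for the sample; "tsn" and the sample sizes are placeholders:

#include <cstdio>
#include "mmdeploy/video_recognizer.h"

int run_video_recognition(const mmdeploy_mat_t* frames)
{
    mmdeploy_video_recognizer_t recognizer{};
    if (int ec = mmdeploy_video_recognizer_create_by_path("tsn", "cpu", 0, &recognizer))
    {
        return ec;
    }
    // clip_len = 8, num_clips = 1 -> `frames` must hold 8 consecutive mats
    mmdeploy_video_sample_info_t  info{8, 1};
    mmdeploy_video_recognition_t* results{};
    int*                          result_count{};
    int ec = mmdeploy_video_recognizer_apply(recognizer, frames, &info, 1, &results, &result_count);
    if (ec == MMDEPLOY_SUCCESS)
    {
        for (int i = 0; i < result_count[0]; ++i)
        {
            std::printf("label %d score %.3f\n", results[i].label_id, results[i].score);
        }
        mmdeploy_video_recognizer_release_result(results, result_count, 1);
    }
    mmdeploy_video_recognizer_destroy(recognizer);
    return ec;
}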
It must be released by \ref - * mmdeploy_video_recognizer_release_result - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_video_recognizer_apply(mmdeploy_video_recognizer_t recognizer, - const mmdeploy_mat_t* images, - const mmdeploy_video_sample_info_t* video_info, - int video_count, - mmdeploy_video_recognition_t** results, - int** result_count); - -/** @brief Release result buffer returned by \ref mmdeploy_video_recognizer_apply - * @param[in] results result buffer by video recognizer - * @param[in] result_count \p results size buffer - * @param[in] video_count length of \p result_count - */ -MMDEPLOY_API void mmdeploy_video_recognizer_release_result(mmdeploy_video_recognition_t* results, - int* result_count, int video_count); - -/** - * @brief destroy video recognizer - * @param[in] recognizer handle of video recognizer created by \ref - * mmdeploy_video_recognizer_create_by_path or \ref mmdeploy_video_recognizer_create - */ -MMDEPLOY_API void mmdeploy_video_recognizer_destroy(mmdeploy_video_recognizer_t recognizer); - -/** - * @brief Same as \ref mmdeploy_video_recognizer_create, but allows to control execution context of - * tasks via context - */ -MMDEPLOY_API int mmdeploy_video_recognizer_create_v2(mmdeploy_model_t model, - mmdeploy_context_t context, - mmdeploy_video_recognizer_t* recognizer); - -/** - * @brief Pack video recognizer inputs into mmdeploy_value_t - * @param[in] images a batch of videos - * @param[in] video_info video information of each video - * @param[in] video_count number of videos in the batch - * @param[out] value created value - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_video_recognizer_create_input( - const mmdeploy_mat_t* images, const mmdeploy_video_sample_info_t* video_info, int video_count, - mmdeploy_value_t* value); - -/** - * @brief Apply video recognizer to a batch of videos - * @param[in] input packed input - * @param[out] output inference output - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_video_recognizer_apply_v2(mmdeploy_video_recognizer_t recognizer, - mmdeploy_value_t input, - mmdeploy_value_t* output); - -/** - * @brief Apply video recognizer to a batch of videos - * @param[in] output inference output - * @param[out] results structured output - * @param[out] result_count number of each videos - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_video_recognizer_get_result(mmdeploy_value_t output, - mmdeploy_video_recognition_t** results, - int** result_count); + typedef struct mmdeploy_video_recognition_t + { + int label_id; + float score; + } mmdeploy_video_recognition_t; + + typedef struct mmdeploy_video_sample_info_t + { + int clip_len; + int num_clips; + } mmdeploy_video_sample_info_t; + + typedef struct mmdeploy_video_recognizer* mmdeploy_video_recognizer_t; + + /** + * @brief Create video recognizer's handle + * @param[in] model an instance of mmaction sdk model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. 
+ * @param[out] recognizer handle of the created video recognizer, which must be destroyed + * by \ref mmdeploy_video_recognizer_destroy + * @return status of creating video recognizer's handle + */ + MMDEPLOY_API int mmdeploy_video_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_video_recognizer_t* recognizer); + + /** + * @brief Create a video recognizer instance + * @param[in] model_path path to video recognition model + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. + * @param[out] recognizer handle of the created video recognizer, which must be destroyed + * by \ref mmdeploy_video_recognizer_destroy + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_video_recognizer_create_by_path(const char* model_path, + const char* device_name, + int device_id, + mmdeploy_video_recognizer_t* recognizer); + + /** + * @brief Apply video recognizer to a batch of videos + * @param[in] recognizer video recognizer's handle created by \ref + * mmdeploy_video_recognizer_create_by_path + * @param[in] images a batch of videos + * @param[in] video_info video information of each video + * @param[in] video_count number of videos + * @param[out] results a linear buffer contains the recognized video, must be release + * by \ref mmdeploy_video_recognizer_release_result + * @param[out] result_count a linear buffer with length being \p video_count to save the number of + * recognition results of each video. It must be released by \ref + * mmdeploy_video_recognizer_release_result + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_video_recognizer_apply(mmdeploy_video_recognizer_t recognizer, + const mmdeploy_mat_t* images, + const mmdeploy_video_sample_info_t* video_info, + int video_count, + mmdeploy_video_recognition_t** results, + int** result_count); + + /** @brief Release result buffer returned by \ref mmdeploy_video_recognizer_apply + * @param[in] results result buffer by video recognizer + * @param[in] result_count \p results size buffer + * @param[in] video_count length of \p result_count + */ + MMDEPLOY_API void mmdeploy_video_recognizer_release_result(mmdeploy_video_recognition_t* results, + int* result_count, + int video_count); + + /** + * @brief destroy video recognizer + * @param[in] recognizer handle of video recognizer created by \ref + * mmdeploy_video_recognizer_create_by_path or \ref mmdeploy_video_recognizer_create + */ + MMDEPLOY_API void mmdeploy_video_recognizer_destroy(mmdeploy_video_recognizer_t recognizer); + + /** + * @brief Same as \ref mmdeploy_video_recognizer_create, but allows to control execution context of + * tasks via context + */ + MMDEPLOY_API int mmdeploy_video_recognizer_create_v2(mmdeploy_model_t model, + mmdeploy_context_t context, + mmdeploy_video_recognizer_t* recognizer); + + /** + * @brief Pack video recognizer inputs into mmdeploy_value_t + * @param[in] images a batch of videos + * @param[in] video_info video information of each video + * @param[in] video_count number of videos in the batch + * @param[out] value created value + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_video_recognizer_create_input( + const mmdeploy_mat_t* images, + const mmdeploy_video_sample_info_t* video_info, + int video_count, + mmdeploy_value_t* value); + + /** + * @brief Apply video recognizer to a batch of videos + * @param[in] input packed input + * @param[out] output inference output + * @return status code of the 
operation + */ + MMDEPLOY_API int mmdeploy_video_recognizer_apply_v2(mmdeploy_video_recognizer_t recognizer, + mmdeploy_value_t input, + mmdeploy_value_t* output); + + /** + * @brief Apply video recognizer to a batch of videos + * @param[in] output inference output + * @param[out] results structured output + * @param[out] result_count number of each videos + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_video_recognizer_get_result(mmdeploy_value_t output, + mmdeploy_video_recognition_t** results, + int** result_count); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/classifier.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/classifier.hpp index 1d9880fb7d..bf4772bcfb 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/classifier.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/classifier.hpp @@ -6,68 +6,80 @@ #include "mmdeploy/classifier.h" #include "mmdeploy/common.hpp" -namespace mmdeploy { - -namespace cxx { - -using Classification = mmdeploy_classification_t; - -class Classifier : public NonMovable { - public: - Classifier(const Model& model, const Context& context) { - auto ec = mmdeploy_classifier_create_v2(model, context, &classifier_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - - ~Classifier() { - if (classifier_) { - mmdeploy_classifier_destroy(classifier_); - classifier_ = {}; - } - } - - using Result = Result_; - - std::vector Apply(Span images) { - if (images.empty()) { - return {}; - } - - Classification* results{}; - int* result_count{}; - auto ec = mmdeploy_classifier_apply(classifier_, reinterpret(images.data()), - static_cast(images.size()), &results, &result_count); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - - std::vector rets; - rets.reserve(images.size()); - - std::shared_ptr data(results, [result_count, count = images.size()](auto p) { - mmdeploy_classifier_release_result(p, result_count, count); - }); - - size_t offset = 0; - for (size_t i = 0; i < images.size(); ++i) { - offset += rets.emplace_back(offset, result_count[i], data).size(); - } - - return rets; - } - - Result Apply(const Mat& img) { return Apply(Span{img})[0]; } - - private: - mmdeploy_classifier_t classifier_{}; -}; - -} // namespace cxx - -using cxx::Classification; -using cxx::Classifier; +namespace mmdeploy +{ + + namespace cxx + { + + using Classification = mmdeploy_classification_t; + + class Classifier : public NonMovable + { + public: + Classifier(const Model& model, const Context& context) + { + auto ec = mmdeploy_classifier_create_v2(model, context, &classifier_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + + ~Classifier() + { + if (classifier_) + { + mmdeploy_classifier_destroy(classifier_); + classifier_ = {}; + } + } + + using Result = Result_; + + std::vector Apply(Span images) + { + if (images.empty()) + { + return {}; + } + + Classification* results{}; + int* result_count{}; + auto ec = mmdeploy_classifier_apply(classifier_, reinterpret(images.data()), static_cast(images.size()), &results, &result_count); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::vector rets; + rets.reserve(images.size()); + + std::shared_ptr data(results, [result_count, count = images.size()](auto p) + { mmdeploy_classifier_release_result(p, result_count, count); }); + + size_t offset = 0; + for (size_t i = 0; i < images.size(); ++i) + { + offset += rets.emplace_back(offset, result_count[i], data).size(); + } + + return rets; + } + + Result Apply(const Mat& 
img) + { + return Apply(Span{img})[0]; + } + + private: + mmdeploy_classifier_t classifier_{}; + }; + + } // namespace cxx + + using cxx::Classification; + using cxx::Classifier; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/common.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/common.hpp index 610c3a8b9e..a7547aa7c7 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/common.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/common.hpp @@ -16,253 +16,378 @@ #include "mmdeploy/model.h" #ifndef MMDEPLOY_CXX_USE_OPENCV -#define MMDEPLOY_CXX_USE_OPENCV 1 + #define MMDEPLOY_CXX_USE_OPENCV 1 #endif #if MMDEPLOY_CXX_USE_OPENCV -#include "opencv2/core/core.hpp" + #include "opencv2/core/core.hpp" #endif -namespace mmdeploy { - -namespace cxx { - -using Rect = mmdeploy_rect_t; - -template -class UniqueHandle : public NonCopyable { - public: - UniqueHandle() = default; - explicit UniqueHandle(T handle) : handle_(handle) {} - - // derived class must destroy the object and reset `handle_` - ~UniqueHandle() { assert(handle_ == nullptr); } - - UniqueHandle(UniqueHandle&& o) noexcept : handle_(std::exchange(o.handle_, nullptr)) {} - UniqueHandle& operator=(UniqueHandle&& o) noexcept { - if (this != &o) { - handle_ = std::exchange(o.handle_, nullptr); - } - return *this; - } - - explicit operator T() const noexcept { return handle_; } - T operator->() const noexcept { return handle_; } - - protected: - T handle_{}; -}; - -class Model { - public: - explicit Model(const char* path) { - mmdeploy_model_t model{}; - auto ec = mmdeploy_model_create_by_path(path, &model); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - model_.reset(model, [](auto p) { mmdeploy_model_destroy(p); }); - } - - explicit Model(const std::string& path) : Model(path.c_str()) {} - - Model(const void* buffer, size_t size) { - mmdeploy_model_t model{}; - auto ec = mmdeploy_model_create(buffer, static_cast(size), &model); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - model_.reset(model, [](auto p) { mmdeploy_model_destroy(p); }); - } - - operator mmdeploy_model_t() const noexcept { return model_.get(); } - - private: - std::shared_ptr model_{}; -}; - -class Device { - public: - explicit Device(std::string name, int index = 0) : name_(std::move(name)), index_(index) { - mmdeploy_device_t device{}; - auto ec = mmdeploy_device_create(name_.c_str(), index, &device); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - device_.reset(device, [](auto p) { mmdeploy_device_destroy(p); }); - } - - const char* name() const noexcept { return name_.c_str(); } - int index() const noexcept { return index_; } - - operator mmdeploy_device_t() const noexcept { return device_.get(); } - - private: - std::string name_; - int index_; - std::shared_ptr device_; -}; - -class Profiler { - public: - explicit Profiler(std::string_view path) : path_(path) { - mmdeploy_profiler_t profiler{}; - auto ec = mmdeploy_profiler_create(path_.c_str(), &profiler); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - profiler_.reset(profiler, [](auto p) { mmdeploy_profiler_destroy(p); }); - }; - - operator mmdeploy_profiler_t() const noexcept { return profiler_.get(); } - - private: - std::string path_; - std::shared_ptr profiler_; -}; - -class Mat { - public: - Mat() : desc_{} {} - - Mat(int height, int width, int channels, mmdeploy_pixel_format_t format, - mmdeploy_data_type_t type, uint8_t* data, mmdeploy_device_t device = nullptr) - : desc_{data, height, width, channels, format, 
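A sketch of the C++ Classifier wrapper in caller code, using the cv::Mat interop defined in common.hpp below. The label_id/score fields follow mmdeploy_classification_t from classifier.h, which is outside this hunk, and the model path is a placeholder:

#include <cstdio>
#include "mmdeploy/classifier.hpp"
#include "opencv2/imgcodecs.hpp"

int main()
{
    mmdeploy::cxx::Model      model("resnet_model_dir");  // placeholder path
    mmdeploy::cxx::Device     device("cpu", 0);
    mmdeploy::cxx::Context    context(device);
    mmdeploy::cxx::Classifier classifier(model, context);  // throws on failure

    cv::Mat img    = cv::imread("demo.jpg");  // 8-bit BGR -> implicit cxx::Mat
    auto    result = classifier.Apply(img);   // single-image overload
    for (const auto& cls : result)
    {
        std::printf("label %d score %.3f\n", cls.label_id, cls.score);
    }
    return 0;
}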
type, device} {} - - Mat(const mmdeploy_mat_t& desc) : desc_(desc) {} // NOLINT - - const mmdeploy_mat_t& desc() const noexcept { return desc_; } +namespace mmdeploy +{ + + namespace cxx + { + + using Rect = mmdeploy_rect_t; + + template + class UniqueHandle : public NonCopyable + { + public: + UniqueHandle() = default; + explicit UniqueHandle(T handle) + : handle_(handle) + { + } + + // derived class must destroy the object and reset `handle_` + ~UniqueHandle() + { + assert(handle_ == nullptr); + } + + UniqueHandle(UniqueHandle&& o) noexcept + : handle_(std::exchange(o.handle_, nullptr)) + { + } + UniqueHandle& operator=(UniqueHandle&& o) noexcept + { + if (this != &o) + { + handle_ = std::exchange(o.handle_, nullptr); + } + return *this; + } + + explicit operator T() const noexcept + { + return handle_; + } + T operator->() const noexcept + { + return handle_; + } + + protected: + T handle_{}; + }; + + class Model + { + public: + explicit Model(const char* path) + { + mmdeploy_model_t model{}; + auto ec = mmdeploy_model_create_by_path(path, &model); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + model_.reset(model, [](auto p) + { mmdeploy_model_destroy(p); }); + } + + explicit Model(const std::string& path) + : Model(path.c_str()) + { + } + + Model(const void* buffer, size_t size) + { + mmdeploy_model_t model{}; + auto ec = mmdeploy_model_create(buffer, static_cast(size), &model); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + model_.reset(model, [](auto p) + { mmdeploy_model_destroy(p); }); + } + + operator mmdeploy_model_t() const noexcept + { + return model_.get(); + } + + private: + std::shared_ptr model_{}; + }; + + class Device + { + public: + explicit Device(std::string name, int index = 0) + : name_(std::move(name)) + , index_(index) + { + mmdeploy_device_t device{}; + auto ec = mmdeploy_device_create(name_.c_str(), index, &device); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + device_.reset(device, [](auto p) + { mmdeploy_device_destroy(p); }); + } + + const char* name() const noexcept + { + return name_.c_str(); + } + int index() const noexcept + { + return index_; + } + + operator mmdeploy_device_t() const noexcept + { + return device_.get(); + } + + private: + std::string name_; + int index_; + std::shared_ptr device_; + }; + + class Profiler + { + public: + explicit Profiler(std::string_view path) + : path_(path) + { + mmdeploy_profiler_t profiler{}; + auto ec = mmdeploy_profiler_create(path_.c_str(), &profiler); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + profiler_.reset(profiler, [](auto p) + { mmdeploy_profiler_destroy(p); }); + }; + + operator mmdeploy_profiler_t() const noexcept + { + return profiler_.get(); + } + + private: + std::string path_; + std::shared_ptr profiler_; + }; + + class Mat + { + public: + Mat() + : desc_{} + { + } + + Mat(int height, int width, int channels, mmdeploy_pixel_format_t format, mmdeploy_data_type_t type, uint8_t* data, mmdeploy_device_t device = nullptr) + : desc_{data, height, width, channels, format, type, device} + { + } + + Mat(const mmdeploy_mat_t& desc) + : desc_(desc) + { + } // NOLINT + + const mmdeploy_mat_t& desc() const noexcept + { + return desc_; + } #if MMDEPLOY_CXX_USE_OPENCV - Mat(const cv::Mat& mat, mmdeploy_pixel_format_t pixel_format) - : desc_{mat.data, mat.rows, mat.cols, mat.channels(), pixel_format, GetCvType(mat.depth())} { - if (pixel_format == MMDEPLOY_PIXEL_FORMAT_COUNT) { - 
throw_exception(eNotSupported); - } - if (desc_.type == MMDEPLOY_DATA_TYPE_COUNT) { - throw_exception(eNotSupported); - } - } - Mat(const cv::Mat& mat) : Mat(mat, GetCvFormat(mat.channels())) {} - - static mmdeploy_data_type_t GetCvType(int depth) { - switch (depth) { - case CV_8U: - return MMDEPLOY_DATA_TYPE_UINT8; - case CV_32F: - return MMDEPLOY_DATA_TYPE_FLOAT; - default: - return MMDEPLOY_DATA_TYPE_COUNT; - } - } - static mmdeploy_pixel_format_t GetCvFormat(int channels) { - switch (channels) { - case 1: - return MMDEPLOY_PIXEL_FORMAT_GRAYSCALE; - case 3: - return MMDEPLOY_PIXEL_FORMAT_BGR; - case 4: - return MMDEPLOY_PIXEL_FORMAT_BGRA; - default: - return MMDEPLOY_PIXEL_FORMAT_COUNT; - } - } + Mat(const cv::Mat& mat, mmdeploy_pixel_format_t pixel_format) + : desc_{mat.data, mat.rows, mat.cols, mat.channels(), pixel_format, GetCvType(mat.depth())} + { + if (pixel_format == MMDEPLOY_PIXEL_FORMAT_COUNT) + { + throw_exception(eNotSupported); + } + if (desc_.type == MMDEPLOY_DATA_TYPE_COUNT) + { + throw_exception(eNotSupported); + } + } + Mat(const cv::Mat& mat) + : Mat(mat, GetCvFormat(mat.channels())) + { + } + + static mmdeploy_data_type_t GetCvType(int depth) + { + switch (depth) + { + case CV_8U: + return MMDEPLOY_DATA_TYPE_UINT8; + case CV_32F: + return MMDEPLOY_DATA_TYPE_FLOAT; + default: + return MMDEPLOY_DATA_TYPE_COUNT; + } + } + static mmdeploy_pixel_format_t GetCvFormat(int channels) + { + switch (channels) + { + case 1: + return MMDEPLOY_PIXEL_FORMAT_GRAYSCALE; + case 3: + return MMDEPLOY_PIXEL_FORMAT_BGR; + case 4: + return MMDEPLOY_PIXEL_FORMAT_BGRA; + default: + return MMDEPLOY_PIXEL_FORMAT_COUNT; + } + } #endif - private: - mmdeploy_mat_t desc_; -}; - -template -class Result_ { - public: - using value_type = T; - using size_type = size_t; - using difference_type = ptrdiff_t; - using reference = T&; - using const_reference = const T&; - using pointer = T*; - using const_pointer = const T*; - using iterator = T*; - using const_iterator = T*; - - Result_(size_t offset, size_t size, std::shared_ptr data) - : offset_(offset), size_(size), data_(std::move(data)) {} - - T& operator[](size_t index) const noexcept { return *(data_.get() + offset_ + index); } - size_t size() const noexcept { return size_; } - T* begin() const noexcept { return data_.get() + offset_; } - T* end() const noexcept { return begin() + size_; } - - T* operator->() const noexcept { return data_.get(); } - T& operator*() const noexcept { return *data_; } - - private: - size_t offset_; - size_t size_; - std::shared_ptr data_; -}; - -inline const mmdeploy_mat_t* reinterpret(const Mat* p) { - return reinterpret_cast(p); -} - -class Scheduler { - public: - explicit Scheduler(mmdeploy_scheduler_t scheduler) { - scheduler_.reset(scheduler, [](auto p) { mmdeploy_scheduler_destroy(p); }); - } - - static Scheduler ThreadPool(int num_threads) { - return Scheduler(mmdeploy_executor_create_thread_pool(num_threads)); - } - static Scheduler Thread() { return Scheduler(mmdeploy_executor_create_thread()); } - - operator mmdeploy_scheduler_t() const noexcept { return scheduler_.get(); } - - private: - std::shared_ptr scheduler_; -}; - -class Context { - public: - Context() { - mmdeploy_context_t context{}; - mmdeploy_context_create(&context); - context_.reset(context, [](auto p) { mmdeploy_context_destroy(p); }); - } - /* implicit */ Context(const Device& device) : Context() { Add(device); } - - void Add(const std::string& name, const Scheduler& scheduler) { - mmdeploy_context_add(*this, MMDEPLOY_TYPE_SCHEDULER, 
name.c_str(), scheduler); - } - - void Add(const std::string& name, const Model& model) { - mmdeploy_context_add(*this, MMDEPLOY_TYPE_MODEL, name.c_str(), model); - } - - void Add(const Device& device) { - mmdeploy_context_add(*this, MMDEPLOY_TYPE_DEVICE, nullptr, device); - } - - void Add(const Profiler& profiler) { - mmdeploy_context_add(*this, MMDEPLOY_TYPE_PROFILER, nullptr, profiler); - } - - operator mmdeploy_context_t() const noexcept { return context_.get(); } - - private: - std::shared_ptr context_; -}; - -} // namespace cxx - -using cxx::Context; -using cxx::Device; -using cxx::Mat; -using cxx::Model; -using cxx::Profiler; -using cxx::Rect; -using cxx::Scheduler; + private: + mmdeploy_mat_t desc_; + }; + + template + class Result_ + { + public: + using value_type = T; + using size_type = size_t; + using difference_type = ptrdiff_t; + using reference = T&; + using const_reference = const T&; + using pointer = T*; + using const_pointer = const T*; + using iterator = T*; + using const_iterator = T*; + + Result_(size_t offset, size_t size, std::shared_ptr data) + : offset_(offset) + , size_(size) + , data_(std::move(data)) + { + } + + T& operator[](size_t index) const noexcept + { + return *(data_.get() + offset_ + index); + } + size_t size() const noexcept + { + return size_; + } + T* begin() const noexcept + { + return data_.get() + offset_; + } + T* end() const noexcept + { + return begin() + size_; + } + + T* operator->() const noexcept + { + return data_.get(); + } + T& operator*() const noexcept + { + return *data_; + } + + private: + size_t offset_; + size_t size_; + std::shared_ptr data_; + }; + + inline const mmdeploy_mat_t* reinterpret(const Mat* p) + { + return reinterpret_cast(p); + } + + class Scheduler + { + public: + explicit Scheduler(mmdeploy_scheduler_t scheduler) + { + scheduler_.reset(scheduler, [](auto p) + { mmdeploy_scheduler_destroy(p); }); + } + + static Scheduler ThreadPool(int num_threads) + { + return Scheduler(mmdeploy_executor_create_thread_pool(num_threads)); + } + static Scheduler Thread() + { + return Scheduler(mmdeploy_executor_create_thread()); + } + + operator mmdeploy_scheduler_t() const noexcept + { + return scheduler_.get(); + } + + private: + std::shared_ptr scheduler_; + }; + + class Context + { + public: + Context() + { + mmdeploy_context_t context{}; + mmdeploy_context_create(&context); + context_.reset(context, [](auto p) + { mmdeploy_context_destroy(p); }); + } + /* implicit */ Context(const Device& device) + : Context() + { + Add(device); + } + + void Add(const std::string& name, const Scheduler& scheduler) + { + mmdeploy_context_add(*this, MMDEPLOY_TYPE_SCHEDULER, name.c_str(), scheduler); + } + + void Add(const std::string& name, const Model& model) + { + mmdeploy_context_add(*this, MMDEPLOY_TYPE_MODEL, name.c_str(), model); + } + + void Add(const Device& device) + { + mmdeploy_context_add(*this, MMDEPLOY_TYPE_DEVICE, nullptr, device); + } + + void Add(const Profiler& profiler) + { + mmdeploy_context_add(*this, MMDEPLOY_TYPE_PROFILER, nullptr, profiler); + } + + operator mmdeploy_context_t() const noexcept + { + return context_.get(); + } + + private: + std::shared_ptr context_; + }; + + } // namespace cxx + + using cxx::Context; + using cxx::Device; + using cxx::Mat; + using cxx::Model; + using cxx::Profiler; + using cxx::Rect; + using cxx::Scheduler; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/detector.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/detector.hpp index 847505bbe7..6f38a20d90 100644 --- 
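For reference, here is a minimal usage sketch of the reformatted common.hpp primitives (Model, Device, Context, Mat) together with the Classifier wrapper shown earlier. It is an editor's illustration, not part of the patch: the model directory "./mmcls_model" and the image "demo.jpg" are hypothetical placeholders, and the OpenCV interop assumes MMDEPLOY_CXX_USE_OPENCV=1 (the default set above).

// Usage sketch (illustrative; paths are hypothetical).
#include <cstdio>
#include "opencv2/imgcodecs.hpp"
#include "mmdeploy/classifier.hpp"

int main()
{
    mmdeploy::Model      model("./mmcls_model");      // hypothetical converted model dir
    mmdeploy::Device     device("cpu");               // or mmdeploy::Device("cuda", 0)
    mmdeploy::Context    context(device);             // implicit Device -> Context conversion
    mmdeploy::Classifier classifier(model, context);  // throws on failure

    cv::Mat bgr = cv::imread("demo.jpg");  // hypothetical input image
    mmdeploy::Mat img(bgr);                // pixel format deduced from channels (3 -> BGR)

    auto result = classifier.Apply(img);  // Result_ over mmdeploy_classification_t
    for (const auto& cls : result)
    {
        std::printf("label=%d score=%.3f\n", cls.label_id, cls.score);
    }
    return 0;
}

Note the design choice visible in the wrapper: each Result_ shares ownership of the C-API result buffer through the shared_ptr deleter, so the classifications stay valid even after other results from the same batch are discarded.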
a/csrc/mmdeploy/apis/cxx/mmdeploy/detector.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/detector.hpp @@ -6,68 +6,80 @@ #include "mmdeploy/common.hpp" #include "mmdeploy/detector.h" -namespace mmdeploy { - -namespace cxx { - -using Detection = mmdeploy_detection_t; - -class Detector : public NonMovable { - public: - Detector(const Model& model, const Context& context) { - auto ec = mmdeploy_detector_create_v2(model, context, &detector_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - - ~Detector() { - if (detector_) { - mmdeploy_detector_destroy(detector_); - detector_ = {}; - } - } - - using Result = Result_; - - std::vector Apply(Span images) { - if (images.empty()) { - return {}; - } - - Detection* results{}; - int* result_count{}; - auto ec = mmdeploy_detector_apply(detector_, reinterpret(images.data()), - static_cast(images.size()), &results, &result_count); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - - std::shared_ptr data(results, [result_count, count = images.size()](auto p) { - mmdeploy_detector_release_result(p, result_count, count); - }); - - std::vector rets; - rets.reserve(images.size()); - - size_t offset = 0; - for (size_t i = 0; i < images.size(); ++i) { - offset += rets.emplace_back(offset, result_count[i], data).size(); - } - - return rets; - } - - Result Apply(const Mat& image) { return Apply(Span{image})[0]; } - - private: - mmdeploy_detector_t detector_{}; -}; - -} // namespace cxx - -using cxx::Detection; -using cxx::Detector; +namespace mmdeploy +{ + + namespace cxx + { + + using Detection = mmdeploy_detection_t; + + class Detector : public NonMovable + { + public: + Detector(const Model& model, const Context& context) + { + auto ec = mmdeploy_detector_create_v2(model, context, &detector_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + + ~Detector() + { + if (detector_) + { + mmdeploy_detector_destroy(detector_); + detector_ = {}; + } + } + + using Result = Result_; + + std::vector Apply(Span images) + { + if (images.empty()) + { + return {}; + } + + Detection* results{}; + int* result_count{}; + auto ec = mmdeploy_detector_apply(detector_, reinterpret(images.data()), static_cast(images.size()), &results, &result_count); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::shared_ptr data(results, [result_count, count = images.size()](auto p) + { mmdeploy_detector_release_result(p, result_count, count); }); + + std::vector rets; + rets.reserve(images.size()); + + size_t offset = 0; + for (size_t i = 0; i < images.size(); ++i) + { + offset += rets.emplace_back(offset, result_count[i], data).size(); + } + + return rets; + } + + Result Apply(const Mat& image) + { + return Apply(Span{image})[0]; + } + + private: + mmdeploy_detector_t detector_{}; + }; + + } // namespace cxx + + using cxx::Detection; + using cxx::Detector; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/pipeline.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/pipeline.hpp index e20ec6a224..c5f07f56af 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/pipeline.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/pipeline.hpp @@ -7,72 +7,87 @@ #include "mmdeploy/core/value.h" #include "mmdeploy/pipeline.h" -namespace mmdeploy { +namespace mmdeploy +{ -namespace cxx { + namespace cxx + { -class Pipeline : public NonMovable { - public: - Pipeline(const Value& config, const Context& context) { - mmdeploy_pipeline_t pipeline{}; - auto ec = mmdeploy_pipeline_create_v3((mmdeploy_value_t)&config, 
context, &pipeline); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - pipeline_ = pipeline; - } + class Pipeline : public NonMovable + { + public: + Pipeline(const Value& config, const Context& context) + { + mmdeploy_pipeline_t pipeline{}; + auto ec = mmdeploy_pipeline_create_v3((mmdeploy_value_t)&config, context, &pipeline); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + pipeline_ = pipeline; + } - ~Pipeline() { - if (pipeline_) { - mmdeploy_pipeline_destroy(pipeline_); - pipeline_ = nullptr; - } - } + ~Pipeline() + { + if (pipeline_) + { + mmdeploy_pipeline_destroy(pipeline_); + pipeline_ = nullptr; + } + } - Value Apply(const Value& inputs) { - mmdeploy_value_t tmp{}; - auto ec = mmdeploy_pipeline_apply(pipeline_, (mmdeploy_value_t)&inputs, &tmp); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - Value output = std::move(*(Value*)tmp); - mmdeploy_value_destroy(tmp); - return output; - } + Value Apply(const Value& inputs) + { + mmdeploy_value_t tmp{}; + auto ec = mmdeploy_pipeline_apply(pipeline_, (mmdeploy_value_t)&inputs, &tmp); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + Value output = std::move(*(Value*)tmp); + mmdeploy_value_destroy(tmp); + return output; + } - Value Apply(Span images) { - if (images.empty()) { - return {}; - } - mmdeploy_value_t inputs{}; - auto ec = mmdeploy_common_create_input(reinterpret(images.data()), - static_cast(images.size()), &inputs); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - auto outputs = Apply(*reinterpret_cast(inputs)); - mmdeploy_value_destroy(inputs); + Value Apply(Span images) + { + if (images.empty()) + { + return {}; + } + mmdeploy_value_t inputs{}; + auto ec = mmdeploy_common_create_input(reinterpret(images.data()), + static_cast(images.size()), + &inputs); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + auto outputs = Apply(*reinterpret_cast(inputs)); + mmdeploy_value_destroy(inputs); - return outputs; - } + return outputs; + } - Value Apply(const Mat& image) { - auto outputs = Apply(Span{image}); - Value::Array rets; - rets.reserve(outputs.size()); - for (auto& output : outputs) { - rets.push_back(std::move(output[0])); - } - return rets; - } + Value Apply(const Mat& image) + { + auto outputs = Apply(Span{image}); + Value::Array rets; + rets.reserve(outputs.size()); + for (auto& output : outputs) + { + rets.push_back(std::move(output[0])); + } + return rets; + } - private: - mmdeploy_pipeline_t pipeline_{}; -}; + private: + mmdeploy_pipeline_t pipeline_{}; + }; -} // namespace cxx + } // namespace cxx -using cxx::Pipeline; + using cxx::Pipeline; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/pose_detector.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/pose_detector.hpp index 7432a417fc..6a157f5228 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/pose_detector.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/pose_detector.hpp @@ -6,79 +6,88 @@ #include "mmdeploy/common.hpp" #include "mmdeploy/pose_detector.h" -namespace mmdeploy { - -namespace cxx { - -using PoseDetection = mmdeploy_pose_detection_t; - -class PoseDetector : public NonMovable { - public: - PoseDetector(const Model& model, const Context& context) { - auto ec = mmdeploy_pose_detector_create_v2(model, context, &detector_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - - ~PoseDetector() { - if (detector_) { - mmdeploy_pose_detector_destroy(detector_); - detector_ = {}; - } - } - - 
using Result = Result_; - - std::vector Apply(Span images, Span bboxes, - Span bbox_count) { - if (images.empty()) { - return {}; - } - - const mmdeploy_rect_t* p_bboxes{}; - const int* p_bbox_count{}; - - if (!bboxes.empty()) { - p_bboxes = bboxes.data(); - p_bbox_count = bbox_count.data(); - } - - PoseDetection* results{}; - auto ec = mmdeploy_pose_detector_apply_bbox(detector_, reinterpret(images.data()), - static_cast(images.size()), p_bboxes, - p_bbox_count, &results); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - - std::shared_ptr data(results, [count = images.size()](auto p) { - mmdeploy_pose_detector_release_result(p, count); - }); - - std::vector rets; - rets.reserve(images.size()); - - size_t offset = 0; - for (size_t i = 0; i < images.size(); ++i) { - offset += rets.emplace_back(offset, bboxes.empty() ? 1 : bbox_count[i], data).size(); - } - - return rets; - } - - Result Apply(const Mat& image, Span bboxes = {}) { - return Apply(Span{image}, bboxes, {static_cast(bboxes.size())})[0]; - } - - private: - mmdeploy_pose_detector_t detector_{}; -}; - -} // namespace cxx - -using cxx::PoseDetection; -using cxx::PoseDetector; +namespace mmdeploy +{ + + namespace cxx + { + + using PoseDetection = mmdeploy_pose_detection_t; + + class PoseDetector : public NonMovable + { + public: + PoseDetector(const Model& model, const Context& context) + { + auto ec = mmdeploy_pose_detector_create_v2(model, context, &detector_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + + ~PoseDetector() + { + if (detector_) + { + mmdeploy_pose_detector_destroy(detector_); + detector_ = {}; + } + } + + using Result = Result_; + + std::vector Apply(Span images, Span bboxes, Span bbox_count) + { + if (images.empty()) + { + return {}; + } + + const mmdeploy_rect_t* p_bboxes{}; + const int* p_bbox_count{}; + + if (!bboxes.empty()) + { + p_bboxes = bboxes.data(); + p_bbox_count = bbox_count.data(); + } + + PoseDetection* results{}; + auto ec = mmdeploy_pose_detector_apply_bbox(detector_, reinterpret(images.data()), static_cast(images.size()), p_bboxes, p_bbox_count, &results); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::shared_ptr data(results, [count = images.size()](auto p) + { mmdeploy_pose_detector_release_result(p, count); }); + + std::vector rets; + rets.reserve(images.size()); + + size_t offset = 0; + for (size_t i = 0; i < images.size(); ++i) + { + offset += rets.emplace_back(offset, bboxes.empty() ? 
1 : bbox_count[i], data).size(); + } + + return rets; + } + + Result Apply(const Mat& image, Span bboxes = {}) + { + return Apply(Span{image}, bboxes, {static_cast(bboxes.size())})[0]; + } + + private: + mmdeploy_pose_detector_t detector_{}; + }; + + } // namespace cxx + + using cxx::PoseDetection; + using cxx::PoseDetector; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/pose_tracker.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/pose_tracker.hpp index 077ec75700..e1e330ce05 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/pose_tracker.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/pose_tracker.hpp @@ -6,145 +6,171 @@ #include "mmdeploy/common.hpp" #include "mmdeploy/pose_tracker.h" -namespace mmdeploy { - -namespace cxx { - -class PoseTracker : public UniqueHandle { - public: - using Result = Result_; - class State; - class Params; - - public: - /** - * @brief Create pose tracker pipeline - * @param detect object detection model - * @param pose pose estimation model - * @param context execution context - */ - PoseTracker(const Model& detect, const Model& pose, const Context& context) { - auto ec = mmdeploy_pose_tracker_create(detect, pose, context, &handle_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - ~PoseTracker() { - if (handle_) { - mmdeploy_pose_tracker_destroy(handle_); - handle_ = {}; - } - } - PoseTracker(PoseTracker&&) noexcept = default; - - /** - * @brief Create a tracker state corresponds to a video stream - * @param params params for creating the tracker state - * @return created tracker state - */ - State CreateState(const Params& params); - - /** - * @brief Apply pose tracker pipeline - * @param state tracker state - * @param frame input video frame - * @param detect control the use of detector - * -1: use params.det_interval, 0: don't use detector, 1: force use detector - * @return - */ - Result Apply(State& state, const Mat& frame, int detect = -1); - - /** - * @brief batched version of Apply - * @param states - * @param frames - * @param detects - * @return - */ - std::vector Apply(const Span& states, const Span& frames, - const Span& detects = {}); - - public: - /** - * see \ref mmdeploy/pose_tracker.h for detail - */ - class Params : public UniqueHandle { - public: - explicit Params() { - handle_ = new mmdeploy_pose_tracker_param_t{}; - mmdeploy_pose_tracker_default_params(handle_); - } - ~Params() { - if (handle_) { - delete handle_; - handle_ = {}; - } - } - }; - - class State : public UniqueHandle { - public: - explicit State(mmdeploy_pose_tracker_t pipeline, const mmdeploy_pose_tracker_param_t* params) { - auto ec = mmdeploy_pose_tracker_create_state(pipeline, params, &handle_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - ~State() { - if (handle_) { - mmdeploy_pose_tracker_destroy_state(handle_); - handle_ = {}; - } - } - State(State&&) noexcept = default; - }; -}; - -inline PoseTracker::State PoseTracker::CreateState(const PoseTracker::Params& params) { - return State(handle_, static_cast(params)); -} - -inline std::vector PoseTracker::Apply(const Span& states, - const Span& frames, - const Span& detects) { - if (frames.empty()) { - return {}; - } - mmdeploy_pose_tracker_target_t* results{}; - int32_t* result_count{}; - - auto ec = mmdeploy_pose_tracker_apply( - handle_, reinterpret_cast(states.data()), - reinterpret(frames.data()), detects.data(), static_cast(frames.size()), &results, - &result_count); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - - 
std::shared_ptr data( - results, [result_count, count = frames.size()](auto p) { - mmdeploy_pose_tracker_release_result(p, result_count, count); - }); - - std::vector rets; - rets.reserve(frames.size()); - - size_t offset = 0; - for (size_t i = 0; i < frames.size(); ++i) { - offset += rets.emplace_back(offset, result_count[i], data).size(); - } - - return rets; -} - -inline PoseTracker::Result PoseTracker::Apply(PoseTracker::State& state, const Mat& frame, - int32_t detect) { - return Apply(Span(&state, 1), Span{frame}, Span{detect})[0]; -} - -} // namespace cxx - -using cxx::PoseTracker; +namespace mmdeploy +{ + + namespace cxx + { + + class PoseTracker : public UniqueHandle + { + public: + using Result = Result_; + class State; + class Params; + + public: + /** + * @brief Create pose tracker pipeline + * @param detect object detection model + * @param pose pose estimation model + * @param context execution context + */ + PoseTracker(const Model& detect, const Model& pose, const Context& context) + { + auto ec = mmdeploy_pose_tracker_create(detect, pose, context, &handle_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + ~PoseTracker() + { + if (handle_) + { + mmdeploy_pose_tracker_destroy(handle_); + handle_ = {}; + } + } + PoseTracker(PoseTracker&&) noexcept = default; + + /** + * @brief Create a tracker state corresponds to a video stream + * @param params params for creating the tracker state + * @return created tracker state + */ + State CreateState(const Params& params); + + /** + * @brief Apply pose tracker pipeline + * @param state tracker state + * @param frame input video frame + * @param detect control the use of detector + * -1: use params.det_interval, 0: don't use detector, 1: force use detector + * @return + */ + Result Apply(State& state, const Mat& frame, int detect = -1); + + /** + * @brief batched version of Apply + * @param states + * @param frames + * @param detects + * @return + */ + std::vector Apply(const Span& states, const Span& frames, const Span& detects = {}); + + public: + /** + * see \ref mmdeploy/pose_tracker.h for detail + */ + class Params : public UniqueHandle + { + public: + explicit Params() + { + handle_ = new mmdeploy_pose_tracker_param_t{}; + mmdeploy_pose_tracker_default_params(handle_); + } + ~Params() + { + if (handle_) + { + delete handle_; + handle_ = {}; + } + } + }; + + class State : public UniqueHandle + { + public: + explicit State(mmdeploy_pose_tracker_t pipeline, const mmdeploy_pose_tracker_param_t* params) + { + auto ec = mmdeploy_pose_tracker_create_state(pipeline, params, &handle_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + ~State() + { + if (handle_) + { + mmdeploy_pose_tracker_destroy_state(handle_); + handle_ = {}; + } + } + State(State&&) noexcept = default; + }; + }; + + inline PoseTracker::State PoseTracker::CreateState(const PoseTracker::Params& params) + { + return State(handle_, static_cast(params)); + } + + inline std::vector PoseTracker::Apply(const Span& states, + const Span& frames, + const Span& detects) + { + if (frames.empty()) + { + return {}; + } + mmdeploy_pose_tracker_target_t* results{}; + int32_t* result_count{}; + + auto ec = mmdeploy_pose_tracker_apply( + handle_, + reinterpret_cast(states.data()), + reinterpret(frames.data()), + detects.data(), + static_cast(frames.size()), + &results, + &result_count); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::shared_ptr data( + results, + [result_count, count = 
frames.size()](auto p) + { + mmdeploy_pose_tracker_release_result(p, result_count, count); + }); + + std::vector rets; + rets.reserve(frames.size()); + + size_t offset = 0; + for (size_t i = 0; i < frames.size(); ++i) + { + offset += rets.emplace_back(offset, result_count[i], data).size(); + } + + return rets; + } + + inline PoseTracker::Result PoseTracker::Apply(PoseTracker::State& state, const Mat& frame, int32_t detect) + { + return Apply(Span(&state, 1), Span{frame}, Span{detect})[0]; + } + + } // namespace cxx + + using cxx::PoseTracker; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/restorer.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/restorer.hpp index 671c5c2d0c..dcf9ab75af 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/restorer.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/restorer.hpp @@ -6,62 +6,77 @@ #include "mmdeploy/common.hpp" #include "mmdeploy/restorer.h" -namespace mmdeploy { - -namespace cxx { - -class Restorer : public NonMovable { - public: - Restorer(const Model& model, const Context& context) { - auto ec = mmdeploy_restorer_create_v2(model, context, &restorer_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - - ~Restorer() { - if (restorer_) { - mmdeploy_restorer_destroy(restorer_); - restorer_ = {}; - } - } - - using Result = Result_; - - std::vector Apply(Span images) { - if (images.empty()) { - return {}; - } - - mmdeploy_mat_t* results{}; - auto ec = mmdeploy_restorer_apply(restorer_, reinterpret(images.data()), - static_cast(images.size()), &results); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - - std::vector rets; - rets.reserve(images.size()); - - std::shared_ptr data( - results, [count = images.size()](auto p) { mmdeploy_restorer_release_result(p, count); }); - - for (size_t i = 0; i < images.size(); ++i) { - rets.emplace_back(i, 1, data); - } - - return rets; - } - - Result Apply(const Mat& image) { return Apply(Span{image})[0]; } - - private: - mmdeploy_restorer_t restorer_{}; -}; - -} // namespace cxx - -using cxx::Restorer; +namespace mmdeploy +{ + + namespace cxx + { + + class Restorer : public NonMovable + { + public: + Restorer(const Model& model, const Context& context) + { + auto ec = mmdeploy_restorer_create_v2(model, context, &restorer_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + + ~Restorer() + { + if (restorer_) + { + mmdeploy_restorer_destroy(restorer_); + restorer_ = {}; + } + } + + using Result = Result_; + + std::vector Apply(Span images) + { + if (images.empty()) + { + return {}; + } + + mmdeploy_mat_t* results{}; + auto ec = mmdeploy_restorer_apply(restorer_, reinterpret(images.data()), static_cast(images.size()), &results); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::vector rets; + rets.reserve(images.size()); + + std::shared_ptr data( + results, + [count = images.size()](auto p) + { mmdeploy_restorer_release_result(p, count); }); + + for (size_t i = 0; i < images.size(); ++i) + { + rets.emplace_back(i, 1, data); + } + + return rets; + } + + Result Apply(const Mat& image) + { + return Apply(Span{image})[0]; + } + + private: + mmdeploy_restorer_t restorer_{}; + }; + + } // namespace cxx + + using cxx::Restorer; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/rotated_detector.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/rotated_detector.hpp index fa065b0f0c..5a224f6fa5 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/rotated_detector.hpp +++ 
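The PoseTracker wrapper above pairs a detection model with a pose model and keeps per-stream tracking state. A short tracking-loop sketch follows; it is illustrative only, with "./detect_model", "./pose_model", and "input.mp4" as hypothetical placeholders:

// Tracking-loop sketch (illustrative; not part of this patch).
#include <cstdio>
#include "opencv2/videoio.hpp"
#include "mmdeploy/pose_tracker.hpp"

int main()
{
    mmdeploy::Model       det("./detect_model");  // hypothetical detection model
    mmdeploy::Model       pose("./pose_model");   // hypothetical pose model
    mmdeploy::Context     ctx(mmdeploy::Device("cpu"));
    mmdeploy::PoseTracker tracker(det, pose, ctx);

    mmdeploy::PoseTracker::Params params;      // defaults via mmdeploy_pose_tracker_default_params
    auto state = tracker.CreateState(params);  // one state per video stream

    cv::VideoCapture cap("input.mp4");  // hypothetical video source
    for (cv::Mat frame; cap.read(frame);)
    {
        // detect = -1 (the default) lets the pipeline fall back to params->det_interval
        auto targets = tracker.Apply(state, frame);
        for (const auto& t : targets)
        {
            std::printf("target %u: %d keypoints\n", t.target_id, t.keypoint_count);
        }
    }
    return 0;
}

Reusing one State across frames is what carries the track identities forward; creating a fresh State per frame would reduce the tracker to per-frame detection.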
b/csrc/mmdeploy/apis/cxx/mmdeploy/rotated_detector.hpp @@ -6,69 +6,81 @@ #include "mmdeploy/common.hpp" #include "mmdeploy/rotated_detector.h" -namespace mmdeploy { - -namespace cxx { - -using RotatedDetection = mmdeploy_rotated_detection_t; - -class RotatedDetector : public NonMovable { - public: - RotatedDetector(const Model& model, const Context& context) { - auto ec = mmdeploy_rotated_detector_create_v2(model, context, &detector_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - - ~RotatedDetector() { - if (detector_) { - mmdeploy_rotated_detector_destroy(detector_); - detector_ = {}; - } - } - - using Result = Result_; - - std::vector Apply(Span images) { - if (images.empty()) { - return {}; - } - - RotatedDetection* results{}; - int* result_count{}; - auto ec = - mmdeploy_rotated_detector_apply(detector_, reinterpret(images.data()), - static_cast(images.size()), &results, &result_count); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - - std::shared_ptr data(results, [result_count](auto p) { - mmdeploy_rotated_detector_release_result(p, result_count); - }); - - std::vector rets; - rets.reserve(images.size()); - - size_t offset = 0; - for (size_t i = 0; i < images.size(); ++i) { - offset += rets.emplace_back(offset, result_count[i], data).size(); - } - - return rets; - } - - Result Apply(const Mat& image) { return Apply(Span{image})[0]; } - - private: - mmdeploy_rotated_detector_t detector_{}; -}; - -} // namespace cxx - -using cxx::RotatedDetection; -using cxx::RotatedDetector; +namespace mmdeploy +{ + + namespace cxx + { + + using RotatedDetection = mmdeploy_rotated_detection_t; + + class RotatedDetector : public NonMovable + { + public: + RotatedDetector(const Model& model, const Context& context) + { + auto ec = mmdeploy_rotated_detector_create_v2(model, context, &detector_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + + ~RotatedDetector() + { + if (detector_) + { + mmdeploy_rotated_detector_destroy(detector_); + detector_ = {}; + } + } + + using Result = Result_; + + std::vector Apply(Span images) + { + if (images.empty()) + { + return {}; + } + + RotatedDetection* results{}; + int* result_count{}; + auto ec = + mmdeploy_rotated_detector_apply(detector_, reinterpret(images.data()), static_cast(images.size()), &results, &result_count); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::shared_ptr data(results, [result_count](auto p) + { mmdeploy_rotated_detector_release_result(p, result_count); }); + + std::vector rets; + rets.reserve(images.size()); + + size_t offset = 0; + for (size_t i = 0; i < images.size(); ++i) + { + offset += rets.emplace_back(offset, result_count[i], data).size(); + } + + return rets; + } + + Result Apply(const Mat& image) + { + return Apply(Span{image})[0]; + } + + private: + mmdeploy_rotated_detector_t detector_{}; + }; + + } // namespace cxx + + using cxx::RotatedDetection; + using cxx::RotatedDetector; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/segmentor.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/segmentor.hpp index fe53023d1c..7ad98a91bb 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/segmentor.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/segmentor.hpp @@ -6,65 +6,80 @@ #include "mmdeploy/common.hpp" #include "mmdeploy/segmentor.h" -namespace mmdeploy { - -namespace cxx { - -using Segmentation = mmdeploy_segmentation_t; - -class Segmentor : public NonMovable { - public: - Segmentor(const Model& model, const Context& 
context) { - auto ec = mmdeploy_segmentor_create_v2(model, context, &segmentor_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - - ~Segmentor() { - if (segmentor_) { - mmdeploy_segmentor_destroy(segmentor_); - segmentor_ = {}; - } - } - - using Result = Result_; - - std::vector Apply(Span images) { - if (images.empty()) { - return {}; - } - - Segmentation* results{}; - auto ec = mmdeploy_segmentor_apply(segmentor_, reinterpret(images.data()), - static_cast(images.size()), &results); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - - std::vector rets; - rets.reserve(images.size()); - - std::shared_ptr data( - results, [count = images.size()](auto p) { mmdeploy_segmentor_release_result(p, count); }); - - for (size_t i = 0; i < images.size(); ++i) { - rets.emplace_back(i, 1, data); - } - - return rets; - } - - Result Apply(const Mat& image) { return Apply(Span{image})[0]; } - - private: - mmdeploy_segmentor_t segmentor_{}; -}; - -} // namespace cxx - -using cxx::Segmentation; -using cxx::Segmentor; +namespace mmdeploy +{ + + namespace cxx + { + + using Segmentation = mmdeploy_segmentation_t; + + class Segmentor : public NonMovable + { + public: + Segmentor(const Model& model, const Context& context) + { + auto ec = mmdeploy_segmentor_create_v2(model, context, &segmentor_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + + ~Segmentor() + { + if (segmentor_) + { + mmdeploy_segmentor_destroy(segmentor_); + segmentor_ = {}; + } + } + + using Result = Result_; + + std::vector Apply(Span images) + { + if (images.empty()) + { + return {}; + } + + Segmentation* results{}; + auto ec = mmdeploy_segmentor_apply(segmentor_, reinterpret(images.data()), static_cast(images.size()), &results); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::vector rets; + rets.reserve(images.size()); + + std::shared_ptr data( + results, + [count = images.size()](auto p) + { mmdeploy_segmentor_release_result(p, count); }); + + for (size_t i = 0; i < images.size(); ++i) + { + rets.emplace_back(i, 1, data); + } + + return rets; + } + + Result Apply(const Mat& image) + { + return Apply(Span{image})[0]; + } + + private: + mmdeploy_segmentor_t segmentor_{}; + }; + + } // namespace cxx + + using cxx::Segmentation; + using cxx::Segmentor; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/text_detector.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/text_detector.hpp index d848715405..56f2f02f18 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/text_detector.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/text_detector.hpp @@ -6,69 +6,81 @@ #include "mmdeploy/common.hpp" #include "mmdeploy/text_detector.h" -namespace mmdeploy { - -namespace cxx { - -using TextDetection = mmdeploy_text_detection_t; - -class TextDetector : public NonMovable { - public: - TextDetector(const Model& model, const Context& context) { - auto ec = mmdeploy_text_detector_create_v2(model, context, &detector_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - - ~TextDetector() { - if (detector_) { - mmdeploy_text_detector_destroy(detector_); - detector_ = {}; - } - } - - using Result = Result_; - - std::vector Apply(Span images) { - if (images.empty()) { - return {}; - } - - TextDetection* results{}; - int* result_count{}; - auto ec = - mmdeploy_text_detector_apply(detector_, reinterpret(images.data()), - static_cast(images.size()), &results, &result_count); - if (ec != MMDEPLOY_SUCCESS) { - 
throw_exception(static_cast(ec)); - } - - std::shared_ptr data(results, [result_count, count = images.size()](auto p) { - mmdeploy_text_detector_release_result(p, result_count, count); - }); - - std::vector rets; - rets.reserve(images.size()); - - size_t offset = 0; - for (size_t i = 0; i < images.size(); ++i) { - offset += rets.emplace_back(offset, result_count[i], data).size(); - } - - return rets; - } - - Result Apply(const Mat& image) { return Apply(Span{image})[0]; } - - private: - mmdeploy_text_detector_t detector_{}; -}; - -} // namespace cxx - -using cxx::TextDetection; -using cxx::TextDetector; +namespace mmdeploy +{ + + namespace cxx + { + + using TextDetection = mmdeploy_text_detection_t; + + class TextDetector : public NonMovable + { + public: + TextDetector(const Model& model, const Context& context) + { + auto ec = mmdeploy_text_detector_create_v2(model, context, &detector_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + + ~TextDetector() + { + if (detector_) + { + mmdeploy_text_detector_destroy(detector_); + detector_ = {}; + } + } + + using Result = Result_; + + std::vector Apply(Span images) + { + if (images.empty()) + { + return {}; + } + + TextDetection* results{}; + int* result_count{}; + auto ec = + mmdeploy_text_detector_apply(detector_, reinterpret(images.data()), static_cast(images.size()), &results, &result_count); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::shared_ptr data(results, [result_count, count = images.size()](auto p) + { mmdeploy_text_detector_release_result(p, result_count, count); }); + + std::vector rets; + rets.reserve(images.size()); + + size_t offset = 0; + for (size_t i = 0; i < images.size(); ++i) + { + offset += rets.emplace_back(offset, result_count[i], data).size(); + } + + return rets; + } + + Result Apply(const Mat& image) + { + return Apply(Span{image})[0]; + } + + private: + mmdeploy_text_detector_t detector_{}; + }; + + } // namespace cxx + + using cxx::TextDetection; + using cxx::TextDetector; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/text_recognizer.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/text_recognizer.hpp index eba8ea3902..31c741e2ee 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/text_recognizer.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/text_recognizer.hpp @@ -9,82 +9,91 @@ #include "mmdeploy/text_detector.hpp" #include "mmdeploy/text_recognizer.h" -namespace mmdeploy { - -namespace cxx { - -using TextRecognition = mmdeploy_text_recognition_t; - -class TextRecognizer : public NonMovable { - public: - TextRecognizer(const Model& model, const Context& context) { - auto ec = mmdeploy_text_recognizer_create_v2(model, context, &recognizer_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - - ~TextRecognizer() { - if (recognizer_) { - mmdeploy_text_recognizer_destroy(recognizer_); - recognizer_ = {}; - } - } - - using Result = Result_; - - std::vector Apply(Span images, Span bboxes, - Span bbox_count) { - if (images.empty()) { - return {}; - } - - const TextDetection* p_bboxes{}; - const int* p_bbox_count{}; - - auto n_total_bboxes = static_cast(images.size()); - - if (!bboxes.empty()) { - p_bboxes = bboxes.data(); - p_bbox_count = bbox_count.data(); - n_total_bboxes = std::accumulate(bbox_count.begin(), bbox_count.end(), 0); - } - - TextRecognition* results{}; - auto ec = mmdeploy_text_recognizer_apply_bbox(recognizer_, reinterpret(images.data()), - static_cast(images.size()), p_bboxes, - p_bbox_count, &results); - 
if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - - std::shared_ptr data(results, [count = n_total_bboxes](auto p) { - mmdeploy_text_recognizer_release_result(p, count); - }); - - std::vector rets; - rets.reserve(images.size()); - - size_t offset = 0; - for (size_t i = 0; i < images.size(); ++i) { - offset += rets.emplace_back(offset, bboxes.empty() ? 1 : bbox_count[i], data).size(); - } - - return rets; - } - - Result Apply(const Mat& image, Span bboxes = {}) { - return Apply(Span{image}, bboxes, {static_cast(bboxes.size())})[0]; - } - - private: - mmdeploy_text_recognizer_t recognizer_{}; -}; - -} // namespace cxx - -using cxx::TextRecognition; -using cxx::TextRecognizer; +namespace mmdeploy +{ + + namespace cxx + { + + using TextRecognition = mmdeploy_text_recognition_t; + + class TextRecognizer : public NonMovable + { + public: + TextRecognizer(const Model& model, const Context& context) + { + auto ec = mmdeploy_text_recognizer_create_v2(model, context, &recognizer_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + + ~TextRecognizer() + { + if (recognizer_) + { + mmdeploy_text_recognizer_destroy(recognizer_); + recognizer_ = {}; + } + } + + using Result = Result_; + + std::vector Apply(Span images, Span bboxes, Span bbox_count) + { + if (images.empty()) + { + return {}; + } + + const TextDetection* p_bboxes{}; + const int* p_bbox_count{}; + + auto n_total_bboxes = static_cast(images.size()); + + if (!bboxes.empty()) + { + p_bboxes = bboxes.data(); + p_bbox_count = bbox_count.data(); + n_total_bboxes = std::accumulate(bbox_count.begin(), bbox_count.end(), 0); + } + + TextRecognition* results{}; + auto ec = mmdeploy_text_recognizer_apply_bbox(recognizer_, reinterpret(images.data()), static_cast(images.size()), p_bboxes, p_bbox_count, &results); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::shared_ptr data(results, [count = n_total_bboxes](auto p) + { mmdeploy_text_recognizer_release_result(p, count); }); + + std::vector rets; + rets.reserve(images.size()); + + size_t offset = 0; + for (size_t i = 0; i < images.size(); ++i) + { + offset += rets.emplace_back(offset, bboxes.empty() ? 
1 : bbox_count[i], data).size(); + } + + return rets; + } + + Result Apply(const Mat& image, Span bboxes = {}) + { + return Apply(Span{image}, bboxes, {static_cast(bboxes.size())})[0]; + } + + private: + mmdeploy_text_recognizer_t recognizer_{}; + }; + + } // namespace cxx + + using cxx::TextRecognition; + using cxx::TextRecognizer; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/video_recognizer.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/video_recognizer.hpp index 583b28dd59..ed3569e242 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/video_recognizer.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/video_recognizer.hpp @@ -6,85 +6,97 @@ #include "mmdeploy/common.hpp" #include "mmdeploy/video_recognizer.h" -namespace mmdeploy { - -namespace cxx { - -using VideoRecognition = mmdeploy_video_recognition_t; -using VideoSampleInfo = mmdeploy_video_sample_info_t; - -class VideoRecognizer : public NonMovable { - public: - VideoRecognizer(const Model& model, const Context& context) { - auto ec = mmdeploy_video_recognizer_create_v2(model, context, &recognizer_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - - ~VideoRecognizer() { - if (recognizer_) { - mmdeploy_video_recognizer_destroy(recognizer_); - recognizer_ = {}; - } - } - - using Result = Result_; - - std::vector Apply(Span> videos, - Span infos) { - if (videos.empty()) { - return {}; - } - - int video_count = videos.size(); - - VideoRecognition* results{}; - int* result_count{}; - std::vector images; - std::vector video_info; - for (int i = 0; i < videos.size(); i++) { - for (auto& mat : videos[i]) { - images.push_back(mat); - } - video_info.push_back(infos[i]); - } - - auto ec = - mmdeploy_video_recognizer_apply(recognizer_, reinterpret(images.data()), video_info.data(), - video_count, &results, &result_count); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - - std::vector rets; - rets.reserve(video_count); - - std::shared_ptr data(results, [result_count, count = video_count](auto p) { - mmdeploy_video_recognizer_release_result(p, result_count, count); - }); - - size_t offset = 0; - for (size_t i = 0; i < video_count; ++i) { - offset += rets.emplace_back(offset, result_count[i], data).size(); - } - - return rets; - } - - Result Apply(const std::vector& video, const VideoSampleInfo info) { - return Apply(Span{video}, Span{info})[0]; - } - - private: - mmdeploy_video_recognizer_t recognizer_{}; -}; - -} // namespace cxx - -using cxx::VideoRecognition; -using cxx::VideoRecognizer; -using cxx::VideoSampleInfo; +namespace mmdeploy +{ + + namespace cxx + { + + using VideoRecognition = mmdeploy_video_recognition_t; + using VideoSampleInfo = mmdeploy_video_sample_info_t; + + class VideoRecognizer : public NonMovable + { + public: + VideoRecognizer(const Model& model, const Context& context) + { + auto ec = mmdeploy_video_recognizer_create_v2(model, context, &recognizer_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + + ~VideoRecognizer() + { + if (recognizer_) + { + mmdeploy_video_recognizer_destroy(recognizer_); + recognizer_ = {}; + } + } + + using Result = Result_; + + std::vector Apply(Span> videos, + Span infos) + { + if (videos.empty()) + { + return {}; + } + + int video_count = videos.size(); + + VideoRecognition* results{}; + int* result_count{}; + std::vector images; + std::vector video_info; + for (int i = 0; i < videos.size(); i++) + { + for (auto& mat : videos[i]) + { + images.push_back(mat); + } + video_info.push_back(infos[i]); + } + + 
auto ec = + mmdeploy_video_recognizer_apply(recognizer_, reinterpret(images.data()), video_info.data(), video_count, &results, &result_count); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::vector rets; + rets.reserve(video_count); + + std::shared_ptr data(results, [result_count, count = video_count](auto p) + { mmdeploy_video_recognizer_release_result(p, result_count, count); }); + + size_t offset = 0; + for (size_t i = 0; i < video_count; ++i) + { + offset += rets.emplace_back(offset, result_count[i], data).size(); + } + + return rets; + } + + Result Apply(const std::vector& video, const VideoSampleInfo info) + { + return Apply(Span{video}, Span{info})[0]; + } + + private: + mmdeploy_video_recognizer_t recognizer_{}; + }; + + } // namespace cxx + + using cxx::VideoRecognition; + using cxx::VideoRecognizer; + using cxx::VideoSampleInfo; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/java/native/common.h b/csrc/mmdeploy/apis/java/native/common.h index ba2601e5f1..045dc02a35 100644 --- a/csrc/mmdeploy/apis/java/native/common.h +++ b/csrc/mmdeploy/apis/java/native/common.h @@ -10,45 +10,48 @@ #include "mmdeploy/core/logger.h" #include "mmdeploy/core/utils/formatter.h" -template -static auto With(JNIEnv *env, jobjectArray imgs, F f) noexcept { - auto mat_clazz = env->FindClass("mmdeploy/Mat"); - auto shape_field = env->GetFieldID(mat_clazz, "shape", "[I"); - auto format_field = env->GetFieldID(mat_clazz, "format", "I"); - auto type_field = env->GetFieldID(mat_clazz, "type", "I"); - auto data_field = env->GetFieldID(mat_clazz, "data", "[B"); - auto num = env->GetArrayLength(imgs); - std::vector mats; - std::vector datum; - - mats.reserve(num); - datum.reserve(num); - - for (int i = 0; i < num; ++i) { - auto obj = env->GetObjectArrayElement(imgs, i); - auto shape_obj = env->GetObjectField(obj, shape_field); - auto shape = env->GetIntArrayElements((jintArray)shape_obj, nullptr); - auto format = env->GetIntField(obj, format_field); - auto type = env->GetIntField(obj, type_field); - auto &mat = mats.emplace_back(); - mat.height = shape[0]; - mat.width = shape[1]; - mat.channel = shape[2]; - env->ReleaseIntArrayElements((jintArray)shape_obj, shape, JNI_ABORT); - mat.format = (mmdeploy_pixel_format_t)format; - mat.type = (mmdeploy_data_type_t)type; - auto data_obj = env->GetObjectField(obj, data_field); - mat.data = (uint8_t *)env->GetByteArrayElements((jbyteArray)data_obj, nullptr); - datum.push_back((jbyteArray)data_obj); - } - - auto ret = f(mats.data(), mats.size()); // ! 
f must not throw - - for (int i = 0; i < num; ++i) { - env->ReleaseByteArrayElements(datum[i], (jbyte *)mats[i].data, JNI_ABORT); - } - - return ret; +template +static auto With(JNIEnv* env, jobjectArray imgs, F f) noexcept +{ + auto mat_clazz = env->FindClass("mmdeploy/Mat"); + auto shape_field = env->GetFieldID(mat_clazz, "shape", "[I"); + auto format_field = env->GetFieldID(mat_clazz, "format", "I"); + auto type_field = env->GetFieldID(mat_clazz, "type", "I"); + auto data_field = env->GetFieldID(mat_clazz, "data", "[B"); + auto num = env->GetArrayLength(imgs); + std::vector mats; + std::vector datum; + + mats.reserve(num); + datum.reserve(num); + + for (int i = 0; i < num; ++i) + { + auto obj = env->GetObjectArrayElement(imgs, i); + auto shape_obj = env->GetObjectField(obj, shape_field); + auto shape = env->GetIntArrayElements((jintArray)shape_obj, nullptr); + auto format = env->GetIntField(obj, format_field); + auto type = env->GetIntField(obj, type_field); + auto& mat = mats.emplace_back(); + mat.height = shape[0]; + mat.width = shape[1]; + mat.channel = shape[2]; + env->ReleaseIntArrayElements((jintArray)shape_obj, shape, JNI_ABORT); + mat.format = (mmdeploy_pixel_format_t)format; + mat.type = (mmdeploy_data_type_t)type; + auto data_obj = env->GetObjectField(obj, data_field); + mat.data = (uint8_t*)env->GetByteArrayElements((jbyteArray)data_obj, nullptr); + datum.push_back((jbyteArray)data_obj); + } + + auto ret = f(mats.data(), mats.size()); // ! f must not throw + + for (int i = 0; i < num; ++i) + { + env->ReleaseByteArrayElements(datum[i], (jbyte*)mats[i].data, JNI_ABORT); + } + + return ret; } #endif // MMDEPLOY_CSRC_APIS_JAVA_NATIVE_COMMON_H_ diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.cpp index 2a3309361e..6664a65289 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.cpp @@ -6,30 +6,33 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Classifier_create(JNIEnv *env, jobject, jstring modelPath, jstring deviceName, - jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_classifier_t classifier{}; - auto ec = - mmdeploy_classifier_create_by_path(model_path, device_name, (int)device_id, &classifier); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create classifier, code = {}", ec); - return -1; - } - return (jlong)classifier; +jlong Java_mmdeploy_Classifier_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_classifier_t classifier{}; + auto ec = + mmdeploy_classifier_create_by_path(model_path, device_name, (int)device_id, &classifier); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create classifier, code = {}", ec); + return -1; + } + return (jlong)classifier; } -void Java_mmdeploy_Classifier_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_Classifier_destroy"); - mmdeploy_classifier_destroy((mmdeploy_classifier_t)handle); +void Java_mmdeploy_Classifier_destroy(JNIEnv*, jobject, 
jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Classifier_destroy"); + mmdeploy_classifier_destroy((mmdeploy_classifier_t)handle); } -jobjectArray Java_mmdeploy_Classifier_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images, jintArray counts) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_Classifier_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images, jintArray counts) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_classification_t *results{}; int *result_count{}; auto ec = mmdeploy_classifier_apply((mmdeploy_classifier_t)handle, imgs, size, &results, @@ -55,6 +58,5 @@ jobjectArray Java_mmdeploy_Classifier_apply(JNIEnv *env, jobject thiz, jlong han } env->ReleaseIntArrayElements(counts, counts_array, 0); mmdeploy_classifier_release_result(results, result_count, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.h index 16a06b5fba..84adf58aa3 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.h @@ -3,33 +3,33 @@ /* Header for class mmdeploy_Classifier */ #ifndef _Included_mmdeploy_Classifier -#define _Included_mmdeploy_Classifier -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Classifier - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Classifier_create(JNIEnv *, jobject, jstring, jstring, jint); + #define _Included_mmdeploy_Classifier + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Classifier + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Classifier_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_Classifier - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Classifier_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Classifier + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Classifier_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_Classifier - * Method: apply - * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/Classifier/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Classifier_apply(JNIEnv *, jobject, jlong, - jobjectArray, jintArray); + /* + * Class: mmdeploy_Classifier + * Method: apply + * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/Classifier/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Classifier_apply(JNIEnv*, jobject, jlong, jobjectArray, jintArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Context.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Context.cpp index dbd401724e..e875a66ead 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Context.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Context.cpp @@ -8,36 +8,43 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Context_create(JNIEnv *env, jobject) { - mmdeploy_context_t context{}; - mmdeploy_context_create(&context); - return (jlong)context; +jlong Java_mmdeploy_Context_create(JNIEnv* env, jobject) +{ + mmdeploy_context_t context{}; + mmdeploy_context_create(&context); + return (jlong)context; } -jint Java_mmdeploy_Context_add(JNIEnv 
*env, jobject, jlong context_, jint contextType, jstring name, - jlong handle) { - auto object_name = env->GetStringUTFChars(name, nullptr); - if ((int)contextType == MMDEPLOY_TYPE_SCHEDULER) { - mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, - object_name, (mmdeploy_scheduler_t)handle); - } else if ((int)contextType == MMDEPLOY_TYPE_MODEL) { - mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, - object_name, (mmdeploy_model_t)handle); - } else if ((int)contextType == MMDEPLOY_TYPE_DEVICE) { - mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, - nullptr, (mmdeploy_device_t)handle); - } else if ((int)contextType == MMDEPLOY_TYPE_PROFILER) { - mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, - nullptr, (mmdeploy_profiler_t)handle); - } else { - MMDEPLOY_ERROR("wrong context type, got {}", (int)contextType); - return MMDEPLOY_E_NOT_SUPPORTED; - } - env->ReleaseStringUTFChars(name, object_name); - return 0; +jint Java_mmdeploy_Context_add(JNIEnv* env, jobject, jlong context_, jint contextType, jstring name, jlong handle) +{ + auto object_name = env->GetStringUTFChars(name, nullptr); + if ((int)contextType == MMDEPLOY_TYPE_SCHEDULER) + { + mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, object_name, (mmdeploy_scheduler_t)handle); + } + else if ((int)contextType == MMDEPLOY_TYPE_MODEL) + { + mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, object_name, (mmdeploy_model_t)handle); + } + else if ((int)contextType == MMDEPLOY_TYPE_DEVICE) + { + mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, nullptr, (mmdeploy_device_t)handle); + } + else if ((int)contextType == MMDEPLOY_TYPE_PROFILER) + { + mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, nullptr, (mmdeploy_profiler_t)handle); + } + else + { + MMDEPLOY_ERROR("wrong context type, got {}", (int)contextType); + return MMDEPLOY_E_NOT_SUPPORTED; + } + env->ReleaseStringUTFChars(name, object_name); + return 0; } -void Java_mmdeploy_Context_destroy(JNIEnv *, jobject, jlong context_) { - MMDEPLOY_DEBUG("Java_mmdeploy_Context_destroy"); - mmdeploy_context_destroy((mmdeploy_context_t)context_); +void Java_mmdeploy_Context_destroy(JNIEnv*, jobject, jlong context_) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Context_destroy"); + mmdeploy_context_destroy((mmdeploy_context_t)context_); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Context.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Context.h index 42df819580..00e24065c6 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Context.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Context.h @@ -3,32 +3,33 @@ /* Header for class mmdeploy_Context */ #ifndef _Included_mmdeploy_Context -#define _Included_mmdeploy_Context -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Context - * Method: create - * Signature: ()J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Context_create(JNIEnv *, jobject); + #define _Included_mmdeploy_Context + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Context + * Method: create + * Signature: ()J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Context_create(JNIEnv*, jobject); -/* - * Class: mmdeploy_Context - * Method: add - * Signature: (JILjava/lang/String;J)I - */ -JNIEXPORT jint JNICALL Java_mmdeploy_Context_add(JNIEnv *, jobject, jlong, 
jint, jstring, jlong); + /* + * Class: mmdeploy_Context + * Method: add + * Signature: (JILjava/lang/String;J)I + */ + JNIEXPORT jint JNICALL Java_mmdeploy_Context_add(JNIEnv*, jobject, jlong, jint, jstring, jlong); -/* - * Class: mmdeploy_Context - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Context_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Context + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Context_destroy(JNIEnv*, jobject, jlong); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.cpp index c03ff1a1ff..6e8a32dac7 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.cpp @@ -6,29 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Detector_create(JNIEnv *env, jobject, jstring modelPath, jstring deviceName, - jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_detector_t detector{}; - auto ec = mmdeploy_detector_create_by_path(model_path, device_name, (int)device_id, &detector); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create detector, code = {}", ec); - return -1; - } - return (jlong)detector; +jlong Java_mmdeploy_Detector_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_detector_t detector{}; + auto ec = mmdeploy_detector_create_by_path(model_path, device_name, (int)device_id, &detector); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create detector, code = {}", ec); + return -1; + } + return (jlong)detector; } -void Java_mmdeploy_Detector_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_Detector_destroy"); // maybe use info? - mmdeploy_detector_destroy((mmdeploy_detector_t)handle); +void Java_mmdeploy_Detector_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Detector_destroy"); // maybe use info? 
+ mmdeploy_detector_destroy((mmdeploy_detector_t)handle); } -jobjectArray Java_mmdeploy_Detector_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images, jintArray counts) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_Detector_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images, jintArray counts) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_detection_t *results{}; int *result_count{}; auto ec = @@ -79,6 +82,5 @@ jobjectArray Java_mmdeploy_Detector_apply(JNIEnv *env, jobject thiz, jlong handl } env->ReleaseIntArrayElements(counts, counts_array, 0); mmdeploy_detector_release_result(results, result_count, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.h index 41e711d15a..578643efc8 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.h @@ -3,33 +3,33 @@ /* Header for class mmdeploy_Detector */ #ifndef _Included_mmdeploy_Detector -#define _Included_mmdeploy_Detector -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Detector - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Detector_create(JNIEnv *, jobject, jstring, jstring, jint); + #define _Included_mmdeploy_Detector + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Detector + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Detector_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_Detector - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Detector_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Detector + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Detector_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_Detector - * Method: apply - * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/Detector/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Detector_apply(JNIEnv *, jobject, jlong, jobjectArray, - jintArray); + /* + * Class: mmdeploy_Detector + * Method: apply + * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/Detector/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Detector_apply(JNIEnv*, jobject, jlong, jobjectArray, jintArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Device.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Device.cpp index 8dbec9285b..8160210ed5 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Device.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Device.cpp @@ -6,19 +6,22 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Device_create(JNIEnv *env, jobject, jstring name, jint index) { - auto device_name = env->GetStringUTFChars(name, nullptr); - mmdeploy_device_t device{}; - auto ec = mmdeploy_device_create(device_name, (int)index, &device); - env->ReleaseStringUTFChars(name, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create device, code = {}", ec); - return -1; - } - return (jlong)device; +jlong Java_mmdeploy_Device_create(JNIEnv* env, jobject, jstring name, jint index) +{ + auto device_name = env->GetStringUTFChars(name, nullptr); + 
mmdeploy_device_t device{}; + auto ec = mmdeploy_device_create(device_name, (int)index, &device); + env->ReleaseStringUTFChars(name, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create device, code = {}", ec); + return -1; + } + return (jlong)device; } -void Java_mmdeploy_Device_destroy(JNIEnv *, jobject, jlong device_) { - MMDEPLOY_DEBUG("Java_mmdeploy_Device_destroy"); - mmdeploy_device_destroy((mmdeploy_device_t)device_); +void Java_mmdeploy_Device_destroy(JNIEnv*, jobject, jlong device_) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Device_destroy"); + mmdeploy_device_destroy((mmdeploy_device_t)device_); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Device.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Device.h index 7d7ee9dee7..e751d0f781 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Device.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Device.h @@ -3,25 +3,26 @@ /* Header for class mmdeploy_Device */ #ifndef _Included_mmdeploy_Device -#define _Included_mmdeploy_Device -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Device - * Method: create - * Signature: (Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Device_create(JNIEnv *, jobject, jstring, jint); + #define _Included_mmdeploy_Device + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Device + * Method: create + * Signature: (Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Device_create(JNIEnv*, jobject, jstring, jint); -/* - * Class: mmdeploy_Device - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Device_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Device + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Device_destroy(JNIEnv*, jobject, jlong); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Model.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Model.cpp index 2bbc9a6920..821b1e988e 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Model.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Model.cpp @@ -6,19 +6,22 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Model_create(JNIEnv *env, jobject, jstring path) { - auto model_path = env->GetStringUTFChars(path, nullptr); - mmdeploy_model_t model{}; - auto ec = mmdeploy_model_create_by_path(model_path, &model); - env->ReleaseStringUTFChars(path, model_path); - if (ec) { - MMDEPLOY_ERROR("failed to create model, code = {}", ec); - return -1; - } - return (jlong)model; +jlong Java_mmdeploy_Model_create(JNIEnv* env, jobject, jstring path) +{ + auto model_path = env->GetStringUTFChars(path, nullptr); + mmdeploy_model_t model{}; + auto ec = mmdeploy_model_create_by_path(model_path, &model); + env->ReleaseStringUTFChars(path, model_path); + if (ec) + { + MMDEPLOY_ERROR("failed to create model, code = {}", ec); + return -1; + } + return (jlong)model; } -void Java_mmdeploy_Model_destroy(JNIEnv *, jobject, jlong model_) { - MMDEPLOY_DEBUG("Java_mmdeploy_Model_destroy"); - mmdeploy_model_destroy((mmdeploy_model_t)model_); +void Java_mmdeploy_Model_destroy(JNIEnv*, jobject, jlong model_) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Model_destroy"); + mmdeploy_model_destroy((mmdeploy_model_t)model_); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Model.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Model.h index 11e23a1a81..9fc714c259 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Model.h +++ 
b/csrc/mmdeploy/apis/java/native/mmdeploy_Model.h @@ -3,25 +3,26 @@ /* Header for class mmdeploy_Model */ #ifndef _Included_mmdeploy_Model -#define _Included_mmdeploy_Model -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Model - * Method: create - * Signature: (Ljava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Model_create(JNIEnv *, jobject, jstring); + #define _Included_mmdeploy_Model + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Model + * Method: create + * Signature: (Ljava/lang/String;)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Model_create(JNIEnv*, jobject, jstring); -/* - * Class: mmdeploy_Model - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Model_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Model + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Model_destroy(JNIEnv*, jobject, jlong); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.cpp index 4956555a6e..aac54574a0 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.cpp @@ -6,30 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_PoseDetector_create(JNIEnv *env, jobject, jstring modelPath, jstring deviceName, - jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_pose_detector_t pose_estimator{}; - auto ec = mmdeploy_pose_detector_create_by_path(model_path, device_name, (int)device_id, - &pose_estimator); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create pose estimator, code = {}", ec); - return -1; - } - return (jlong)pose_estimator; +jlong Java_mmdeploy_PoseDetector_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_pose_detector_t pose_estimator{}; + auto ec = mmdeploy_pose_detector_create_by_path(model_path, device_name, (int)device_id, &pose_estimator); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create pose estimator, code = {}", ec); + return -1; + } + return (jlong)pose_estimator; } -void Java_mmdeploy_PoseDetector_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_PoseDetector_destroy"); - mmdeploy_pose_detector_destroy((mmdeploy_pose_detector_t)handle); +void Java_mmdeploy_PoseDetector_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_PoseDetector_destroy"); + mmdeploy_pose_detector_destroy((mmdeploy_pose_detector_t)handle); } -jobjectArray Java_mmdeploy_PoseDetector_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_PoseDetector_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_pose_detection_t *results{}; auto ec = 
mmdeploy_pose_detector_apply((mmdeploy_pose_detector_t)handle, imgs, size, &results); if (ec) { @@ -55,6 +57,5 @@ jobjectArray Java_mmdeploy_PoseDetector_apply(JNIEnv *env, jobject thiz, jlong h env->SetObjectArrayElement(array, i, res); } mmdeploy_pose_detector_release_result(results, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.h b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.h index a50b7fd821..87c70ac0a6 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.h @@ -3,34 +3,33 @@ /* Header for class mmdeploy_PoseDetector */ #ifndef _Included_mmdeploy_PoseDetector -#define _Included_mmdeploy_PoseDetector -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_PoseDetector - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_PoseDetector_create(JNIEnv *, jobject, jstring, jstring, - jint); + #define _Included_mmdeploy_PoseDetector + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_PoseDetector + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_PoseDetector_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_PoseDetector - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_PoseDetector_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_PoseDetector + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_PoseDetector_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_PoseDetector - * Method: apply - * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/PoseDetector/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_PoseDetector_apply(JNIEnv *, jobject, jlong, - jobjectArray); + /* + * Class: mmdeploy_PoseDetector + * Method: apply + * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/PoseDetector/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_PoseDetector_apply(JNIEnv*, jobject, jlong, jobjectArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.cpp index c0d1685729..61fd42eb07 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.cpp @@ -6,143 +6,161 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_PoseTracker_create(JNIEnv *env, jobject, jlong detModel, jlong poseModel, - jlong context) { - mmdeploy_pose_tracker_t pose_tracker{}; - auto ec = mmdeploy_pose_tracker_create((mmdeploy_model_t)detModel, (mmdeploy_model_t)poseModel, - (mmdeploy_context_t)context, &pose_tracker); - if (ec) { - MMDEPLOY_ERROR("failed to create pose tracker, code = {}", ec); - return -1; - } - return (jlong)pose_tracker; +jlong Java_mmdeploy_PoseTracker_create(JNIEnv* env, jobject, jlong detModel, jlong poseModel, jlong context) +{ + mmdeploy_pose_tracker_t pose_tracker{}; + auto ec = mmdeploy_pose_tracker_create((mmdeploy_model_t)detModel, (mmdeploy_model_t)poseModel, (mmdeploy_context_t)context, &pose_tracker); + if (ec) + { + MMDEPLOY_ERROR("failed to create pose tracker, code = {}", ec); + return -1; + } + return (jlong)pose_tracker; } -void Java_mmdeploy_PoseTracker_destroy(JNIEnv *, jobject, jlong handle) { - 
MMDEPLOY_DEBUG("Java_mmdeploy_PoseTracker_destroy"); - mmdeploy_pose_tracker_destroy((mmdeploy_pose_tracker_t)handle); +void Java_mmdeploy_PoseTracker_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_PoseTracker_destroy"); + mmdeploy_pose_tracker_destroy((mmdeploy_pose_tracker_t)handle); } -jobject param_cpp_to_java(JNIEnv *env, mmdeploy_pose_tracker_param_t *params) { - auto param_cls = env->FindClass("mmdeploy/PoseTracker$Params"); - auto param_ctor = env->GetMethodID(param_cls, "", "(IIFFFIFIFFF[FIFIIFF[F)V"); +jobject param_cpp_to_java(JNIEnv* env, mmdeploy_pose_tracker_param_t* params) +{ + auto param_cls = env->FindClass("mmdeploy/PoseTracker$Params"); + auto param_ctor = env->GetMethodID(param_cls, "", "(IIFFFIFIFFF[FIFIIFF[F)V"); - jfloatArray keypointSigmas = env->NewFloatArray(params->keypoint_sigmas_size); - env->SetFloatArrayRegion(keypointSigmas, 0, params->keypoint_sigmas_size, - (jfloat *)params->keypoint_sigmas); - jfloatArray smoothParams = env->NewFloatArray(3); - env->SetFloatArrayRegion(smoothParams, 0, 3, (jfloat *)params->smooth_params); + jfloatArray keypointSigmas = env->NewFloatArray(params->keypoint_sigmas_size); + env->SetFloatArrayRegion(keypointSigmas, 0, params->keypoint_sigmas_size, (jfloat*)params->keypoint_sigmas); + jfloatArray smoothParams = env->NewFloatArray(3); + env->SetFloatArrayRegion(smoothParams, 0, 3, (jfloat*)params->smooth_params); - auto param = env->NewObject( - param_cls, param_ctor, (jint)params->det_interval, (jint)params->det_label, - (jfloat)params->det_thr, (jfloat)params->det_min_bbox_size, (jfloat)params->det_nms_thr, - (jint)params->pose_max_num_bboxes, (jfloat)params->pose_kpt_thr, - (jint)params->pose_min_keypoints, (jfloat)params->pose_bbox_scale, - (jfloat)params->pose_min_bbox_size, (jfloat)params->pose_nms_thr, keypointSigmas, - (jint)params->keypoint_sigmas_size, (jfloat)params->track_iou_thr, - (jint)params->track_max_missing, (jint)params->track_history_size, - (jfloat)params->std_weight_position, (jfloat)params->std_weight_velocity, smoothParams); - return param; + auto param = env->NewObject( + param_cls, + param_ctor, + (jint)params->det_interval, + (jint)params->det_label, + (jfloat)params->det_thr, + (jfloat)params->det_min_bbox_size, + (jfloat)params->det_nms_thr, + (jint)params->pose_max_num_bboxes, + (jfloat)params->pose_kpt_thr, + (jint)params->pose_min_keypoints, + (jfloat)params->pose_bbox_scale, + (jfloat)params->pose_min_bbox_size, + (jfloat)params->pose_nms_thr, + keypointSigmas, + (jint)params->keypoint_sigmas_size, + (jfloat)params->track_iou_thr, + (jint)params->track_max_missing, + (jint)params->track_history_size, + (jfloat)params->std_weight_position, + (jfloat)params->std_weight_velocity, + smoothParams); + return param; } -void param_java_to_cpp(JNIEnv *env, mmdeploy_pose_tracker_param_t *params, jobject customParam) { - auto param_cls = env->FindClass("mmdeploy/PoseTracker$Params"); - auto param_ctor = env->GetMethodID(param_cls, "", "(IIFFFIFIFFF[FIFIIFF[F)V"); +void param_java_to_cpp(JNIEnv* env, mmdeploy_pose_tracker_param_t* params, jobject customParam) +{ + auto param_cls = env->FindClass("mmdeploy/PoseTracker$Params"); + auto param_ctor = env->GetMethodID(param_cls, "", "(IIFFFIFIFFF[FIFIIFF[F)V"); - jfieldID fieldID_detInterval = env->GetFieldID(param_cls, "detInterval", "I"); - jint detInterval = env->GetIntField(customParam, fieldID_detInterval); - params->det_interval = (int)detInterval; - jfieldID fieldID_detLabel = env->GetFieldID(param_cls, "detLabel", "I"); - 
jint detLabel = env->GetIntField(customParam, fieldID_detLabel); - params->det_label = (int)detLabel; - jfieldID fieldID_detThr = env->GetFieldID(param_cls, "detThr", "F"); - jfloat detThr = env->GetFloatField(customParam, fieldID_detThr); - params->det_thr = (float)detThr; - jfieldID fieldID_detMinBboxSize = env->GetFieldID(param_cls, "detMinBboxSize", "F"); - jfloat detMinBboxSize = env->GetFloatField(customParam, fieldID_detMinBboxSize); - params->det_min_bbox_size = (float)detMinBboxSize; - jfieldID fieldID_detNmsThr = env->GetFieldID(param_cls, "detNmsThr", "F"); - jfloat detNmsThr = env->GetFloatField(customParam, fieldID_detNmsThr); - params->det_nms_thr = (float)detNmsThr; - jfieldID fieldID_poseMaxNumBboxes = env->GetFieldID(param_cls, "poseMaxNumBboxes", "I"); - jint poseMaxNumBboxes = env->GetIntField(customParam, fieldID_poseMaxNumBboxes); - params->pose_max_num_bboxes = (int)poseMaxNumBboxes; - jfieldID fieldID_poseKptThr = env->GetFieldID(param_cls, "poseKptThr", "F"); - jfloat poseKptThr = env->GetFloatField(customParam, fieldID_poseKptThr); - params->pose_kpt_thr = (float)poseKptThr; - jfieldID fieldID_poseMinKeypoints = env->GetFieldID(param_cls, "poseMinKeypoints", "I"); - jint poseMinKeypoints = env->GetIntField(customParam, fieldID_poseMinKeypoints); - params->pose_min_keypoints = (int)poseMinKeypoints; - jfieldID fieldID_poseBboxScale = env->GetFieldID(param_cls, "poseBboxScale", "F"); - jfloat poseBboxScale = env->GetFloatField(customParam, fieldID_poseBboxScale); - params->pose_bbox_scale = (float)poseBboxScale; - jfieldID fieldID_poseMinBboxSize = env->GetFieldID(param_cls, "poseMinBboxSize", "F"); - jfloat poseMinBboxSize = env->GetFloatField(customParam, fieldID_poseMinBboxSize); - params->pose_min_bbox_size = (float)poseMinBboxSize; - jfieldID fieldID_poseNmsThr = env->GetFieldID(param_cls, "poseNmsThr", "F"); - jfloat poseNmsThr = env->GetFloatField(customParam, fieldID_poseNmsThr); - params->pose_nms_thr = (float)poseNmsThr; - jfieldID fieldID_keypointSigmas = env->GetFieldID(param_cls, "keypointSigmas", "[F"); - auto keypointSigmasObj = env->GetObjectField(customParam, fieldID_keypointSigmas); - float *keypointSigmas = - (float *)env->GetFloatArrayElements((jfloatArray)keypointSigmasObj, nullptr); - params->keypoint_sigmas = keypointSigmas; - env->ReleaseFloatArrayElements((jfloatArray)keypointSigmasObj, keypointSigmas, JNI_ABORT); - jfieldID fieldID_keypointSigmasSize = env->GetFieldID(param_cls, "keypointSigmasSize", "I"); - jint keypointSigmasSize = env->GetIntField(customParam, fieldID_keypointSigmasSize); - params->keypoint_sigmas_size = keypointSigmasSize; - jfieldID fieldID_trackIouThr = env->GetFieldID(param_cls, "trackIouThr", "F"); - jfloat trackIouThr = env->GetFloatField(customParam, fieldID_trackIouThr); - params->track_iou_thr = trackIouThr; - jfieldID fieldID_trackMaxMissing = env->GetFieldID(param_cls, "trackMaxMissing", "I"); - jint trackMaxMissing = env->GetIntField(customParam, fieldID_trackMaxMissing); - params->track_max_missing = trackMaxMissing; - jfieldID fieldID_trackHistorySize = env->GetFieldID(param_cls, "trackHistorySize", "I"); - jint trackHistorySize = env->GetIntField(customParam, fieldID_trackHistorySize); - params->track_history_size = trackHistorySize; - jfieldID fieldID_stdWeightPosition = env->GetFieldID(param_cls, "stdWeightPosition", "F"); - jfloat stdWeightPosition = env->GetFloatField(customParam, fieldID_stdWeightPosition); - params->std_weight_position = stdWeightPosition; - jfieldID fieldID_stdWeightVelocity = 
env->GetFieldID(param_cls, "stdWeightVelocity", "F"); - jfloat stdWeightVelocity = env->GetFloatField(customParam, fieldID_stdWeightVelocity); - params->std_weight_velocity = stdWeightVelocity; - jfieldID fieldID_smoothParams = env->GetFieldID(param_cls, "smoothParams", "[F"); - auto smoothParamsObj = env->GetObjectField(customParam, fieldID_smoothParams); - float *smoothParams = (float *)env->GetFloatArrayElements((jfloatArray)smoothParamsObj, nullptr); - params->smooth_params[0] = smoothParams[0]; - params->smooth_params[1] = smoothParams[1]; - params->smooth_params[2] = smoothParams[2]; - env->ReleaseFloatArrayElements((jfloatArray)smoothParamsObj, smoothParams, JNI_ABORT); + jfieldID fieldID_detInterval = env->GetFieldID(param_cls, "detInterval", "I"); + jint detInterval = env->GetIntField(customParam, fieldID_detInterval); + params->det_interval = (int)detInterval; + jfieldID fieldID_detLabel = env->GetFieldID(param_cls, "detLabel", "I"); + jint detLabel = env->GetIntField(customParam, fieldID_detLabel); + params->det_label = (int)detLabel; + jfieldID fieldID_detThr = env->GetFieldID(param_cls, "detThr", "F"); + jfloat detThr = env->GetFloatField(customParam, fieldID_detThr); + params->det_thr = (float)detThr; + jfieldID fieldID_detMinBboxSize = env->GetFieldID(param_cls, "detMinBboxSize", "F"); + jfloat detMinBboxSize = env->GetFloatField(customParam, fieldID_detMinBboxSize); + params->det_min_bbox_size = (float)detMinBboxSize; + jfieldID fieldID_detNmsThr = env->GetFieldID(param_cls, "detNmsThr", "F"); + jfloat detNmsThr = env->GetFloatField(customParam, fieldID_detNmsThr); + params->det_nms_thr = (float)detNmsThr; + jfieldID fieldID_poseMaxNumBboxes = env->GetFieldID(param_cls, "poseMaxNumBboxes", "I"); + jint poseMaxNumBboxes = env->GetIntField(customParam, fieldID_poseMaxNumBboxes); + params->pose_max_num_bboxes = (int)poseMaxNumBboxes; + jfieldID fieldID_poseKptThr = env->GetFieldID(param_cls, "poseKptThr", "F"); + jfloat poseKptThr = env->GetFloatField(customParam, fieldID_poseKptThr); + params->pose_kpt_thr = (float)poseKptThr; + jfieldID fieldID_poseMinKeypoints = env->GetFieldID(param_cls, "poseMinKeypoints", "I"); + jint poseMinKeypoints = env->GetIntField(customParam, fieldID_poseMinKeypoints); + params->pose_min_keypoints = (int)poseMinKeypoints; + jfieldID fieldID_poseBboxScale = env->GetFieldID(param_cls, "poseBboxScale", "F"); + jfloat poseBboxScale = env->GetFloatField(customParam, fieldID_poseBboxScale); + params->pose_bbox_scale = (float)poseBboxScale; + jfieldID fieldID_poseMinBboxSize = env->GetFieldID(param_cls, "poseMinBboxSize", "F"); + jfloat poseMinBboxSize = env->GetFloatField(customParam, fieldID_poseMinBboxSize); + params->pose_min_bbox_size = (float)poseMinBboxSize; + jfieldID fieldID_poseNmsThr = env->GetFieldID(param_cls, "poseNmsThr", "F"); + jfloat poseNmsThr = env->GetFloatField(customParam, fieldID_poseNmsThr); + params->pose_nms_thr = (float)poseNmsThr; + jfieldID fieldID_keypointSigmas = env->GetFieldID(param_cls, "keypointSigmas", "[F"); + auto keypointSigmasObj = env->GetObjectField(customParam, fieldID_keypointSigmas); + float* keypointSigmas = + (float*)env->GetFloatArrayElements((jfloatArray)keypointSigmasObj, nullptr); + params->keypoint_sigmas = keypointSigmas; + env->ReleaseFloatArrayElements((jfloatArray)keypointSigmasObj, keypointSigmas, JNI_ABORT); + jfieldID fieldID_keypointSigmasSize = env->GetFieldID(param_cls, "keypointSigmasSize", "I"); + jint keypointSigmasSize = env->GetIntField(customParam, fieldID_keypointSigmasSize); + 
params->keypoint_sigmas_size = keypointSigmasSize; + jfieldID fieldID_trackIouThr = env->GetFieldID(param_cls, "trackIouThr", "F"); + jfloat trackIouThr = env->GetFloatField(customParam, fieldID_trackIouThr); + params->track_iou_thr = trackIouThr; + jfieldID fieldID_trackMaxMissing = env->GetFieldID(param_cls, "trackMaxMissing", "I"); + jint trackMaxMissing = env->GetIntField(customParam, fieldID_trackMaxMissing); + params->track_max_missing = trackMaxMissing; + jfieldID fieldID_trackHistorySize = env->GetFieldID(param_cls, "trackHistorySize", "I"); + jint trackHistorySize = env->GetIntField(customParam, fieldID_trackHistorySize); + params->track_history_size = trackHistorySize; + jfieldID fieldID_stdWeightPosition = env->GetFieldID(param_cls, "stdWeightPosition", "F"); + jfloat stdWeightPosition = env->GetFloatField(customParam, fieldID_stdWeightPosition); + params->std_weight_position = stdWeightPosition; + jfieldID fieldID_stdWeightVelocity = env->GetFieldID(param_cls, "stdWeightVelocity", "F"); + jfloat stdWeightVelocity = env->GetFloatField(customParam, fieldID_stdWeightVelocity); + params->std_weight_velocity = stdWeightVelocity; + jfieldID fieldID_smoothParams = env->GetFieldID(param_cls, "smoothParams", "[F"); + auto smoothParamsObj = env->GetObjectField(customParam, fieldID_smoothParams); + float* smoothParams = (float*)env->GetFloatArrayElements((jfloatArray)smoothParamsObj, nullptr); + params->smooth_params[0] = smoothParams[0]; + params->smooth_params[1] = smoothParams[1]; + params->smooth_params[2] = smoothParams[2]; + env->ReleaseFloatArrayElements((jfloatArray)smoothParamsObj, smoothParams, JNI_ABORT); } -jobject Java_mmdeploy_PoseTracker_setDefaultParams(JNIEnv *env, jobject) { - mmdeploy_pose_tracker_param_t params{}; - mmdeploy_pose_tracker_default_params(¶ms); - return param_cpp_to_java(env, ¶ms); +jobject Java_mmdeploy_PoseTracker_setDefaultParams(JNIEnv* env, jobject) +{ + mmdeploy_pose_tracker_param_t params{}; + mmdeploy_pose_tracker_default_params(¶ms); + return param_cpp_to_java(env, ¶ms); } -jlong Java_mmdeploy_PoseTracker_createState(JNIEnv *env, jobject, jlong pipeline, - jobject paramsObject) { - mmdeploy_pose_tracker_state_t state{}; - mmdeploy_pose_tracker_param_t params{}; - param_java_to_cpp(env, ¶ms, paramsObject); - auto ec = mmdeploy_pose_tracker_create_state((mmdeploy_pose_tracker_t)pipeline, ¶ms, &state); - if (ec) { - MMDEPLOY_ERROR("failed to create pose tracker state, code = {}", ec); - return -1; - } - return (jlong)state; +jlong Java_mmdeploy_PoseTracker_createState(JNIEnv* env, jobject, jlong pipeline, jobject paramsObject) +{ + mmdeploy_pose_tracker_state_t state{}; + mmdeploy_pose_tracker_param_t params{}; + param_java_to_cpp(env, ¶ms, paramsObject); + auto ec = mmdeploy_pose_tracker_create_state((mmdeploy_pose_tracker_t)pipeline, ¶ms, &state); + if (ec) + { + MMDEPLOY_ERROR("failed to create pose tracker state, code = {}", ec); + return -1; + } + return (jlong)state; } -void Java_mmdeploy_PoseTracker_destroyState(JNIEnv *, jobject, jlong state) { - MMDEPLOY_DEBUG("Java_mmdeploy_PoseTracker_destroy"); - mmdeploy_pose_tracker_destroy_state((mmdeploy_pose_tracker_state_t)state); +void Java_mmdeploy_PoseTracker_destroyState(JNIEnv*, jobject, jlong state) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_PoseTracker_destroy"); + mmdeploy_pose_tracker_destroy_state((mmdeploy_pose_tracker_state_t)state); } -jobjectArray Java_mmdeploy_PoseTracker_apply(JNIEnv *env, jobject thiz, jlong handle, - jlongArray states, jobjectArray frames, - jintArray detects, jintArray 
counts) { - return With(env, frames, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_PoseTracker_apply(JNIEnv* env, jobject thiz, jlong handle, jlongArray states, jobjectArray frames, jintArray detects, jintArray counts) +{ + return With(env, frames, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_pose_tracker_target_t *results{}; int *result_count{}; auto states_array = env->GetLongArrayElements(states, nullptr); @@ -189,6 +207,5 @@ jobjectArray Java_mmdeploy_PoseTracker_apply(JNIEnv *env, jobject thiz, jlong ha env->ReleaseLongArrayElements(states, states_array, 0); env->ReleaseIntArrayElements(detects, detects_array, 0); mmdeploy_pose_tracker_release_result(results, result_count, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.h b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.h index 8e8d3905c8..1de79b1eaa 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.h @@ -3,54 +3,54 @@ /* Header for class mmdeploy_PoseTracker */ #ifndef _Included_mmdeploy_PoseTracker -#define _Included_mmdeploy_PoseTracker -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_PoseTracker - * Method: create - * Signature: (JJJ)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_PoseTracker_create(JNIEnv *, jobject, jlong, jlong, jlong); + #define _Included_mmdeploy_PoseTracker + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_PoseTracker + * Method: create + * Signature: (JJJ)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_PoseTracker_create(JNIEnv*, jobject, jlong, jlong, jlong); -/* - * Class: mmdeploy_PoseTracker - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_PoseTracker_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_PoseTracker + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_PoseTracker_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_PoseTracker - * Method: createState - * Signature: (JLmmdeploy/PoseTracker/Params;)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_PoseTracker_createState(JNIEnv *, jobject, jlong, jobject); + /* + * Class: mmdeploy_PoseTracker + * Method: createState + * Signature: (JLmmdeploy/PoseTracker/Params;)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_PoseTracker_createState(JNIEnv*, jobject, jlong, jobject); -/* - * Class: mmdeploy_PoseTracker - * Method: destroyState - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_PoseTracker_destroyState(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_PoseTracker + * Method: destroyState + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_PoseTracker_destroyState(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_PoseTracker - * Method: setDefaultParams - * Signature: ()Lmmdeploy/PoseTracker/Params; - */ -JNIEXPORT jobject JNICALL Java_mmdeploy_PoseTracker_setDefaultParams(JNIEnv *, jobject); + /* + * Class: mmdeploy_PoseTracker + * Method: setDefaultParams + * Signature: ()Lmmdeploy/PoseTracker/Params; + */ + JNIEXPORT jobject JNICALL Java_mmdeploy_PoseTracker_setDefaultParams(JNIEnv*, jobject); -/* - * Class: mmdeploy_PoseTracker - * Method: apply - * Signature: (J[J[Lmmdeploy/Mat;[I[I)[Lmmdeploy/PoseTracker/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_PoseTracker_apply(JNIEnv *, jobject, jlong, jlongArray, - jobjectArray, jintArray, jintArray); + /* + * Class: mmdeploy_PoseTracker 
+ * Method: apply + * Signature: (J[J[Lmmdeploy/Mat;[I[I)[Lmmdeploy/PoseTracker/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_PoseTracker_apply(JNIEnv*, jobject, jlong, jlongArray, jobjectArray, jintArray, jintArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.cpp index 2c63233c5c..2ff419ec7a 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.cpp @@ -6,19 +6,22 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Profiler_create(JNIEnv *env, jobject, jstring path) { - auto profiler_path = env->GetStringUTFChars(path, nullptr); - mmdeploy_profiler_t profiler{}; - auto ec = mmdeploy_profiler_create(profiler_path, &profiler); - env->ReleaseStringUTFChars(path, profiler_path); - if (ec) { - MMDEPLOY_ERROR("failed to create profiler, code = {}", ec); - return -1; - } - return (jlong)profiler; +jlong Java_mmdeploy_Profiler_create(JNIEnv* env, jobject, jstring path) +{ + auto profiler_path = env->GetStringUTFChars(path, nullptr); + mmdeploy_profiler_t profiler{}; + auto ec = mmdeploy_profiler_create(profiler_path, &profiler); + env->ReleaseStringUTFChars(path, profiler_path); + if (ec) + { + MMDEPLOY_ERROR("failed to create profiler, code = {}", ec); + return -1; + } + return (jlong)profiler; } -void Java_mmdeploy_Profiler_destroy(JNIEnv *, jobject, jlong profiler_) { - MMDEPLOY_DEBUG("Java_mmdeploy_Profiler_destroy"); - mmdeploy_profiler_destroy((mmdeploy_profiler_t)profiler_); +void Java_mmdeploy_Profiler_destroy(JNIEnv*, jobject, jlong profiler_) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Profiler_destroy"); + mmdeploy_profiler_destroy((mmdeploy_profiler_t)profiler_); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.h index 2bcdbc42cc..9e829ad38c 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.h @@ -3,25 +3,26 @@ /* Header for class mmdeploy_Profiler */ #ifndef _Included_mmdeploy_Profiler -#define _Included_mmdeploy_Profiler -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Profiler - * Method: create - * Signature: (Ljava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Profiler_create(JNIEnv *, jobject, jstring); + #define _Included_mmdeploy_Profiler + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Profiler + * Method: create + * Signature: (Ljava/lang/String;)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Profiler_create(JNIEnv*, jobject, jstring); -/* - * Class: mmdeploy_Profiler - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Profiler_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Profiler + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Profiler_destroy(JNIEnv*, jobject, jlong); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.cpp index f124d5edae..abc630afa6 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.cpp @@ -6,29 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Restorer_create(JNIEnv *env, 
jobject, jstring modelPath, jstring deviceName, - jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_restorer_t restorer{}; - auto ec = mmdeploy_restorer_create_by_path(model_path, device_name, (int)device_id, &restorer); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create restorer, code = {}", ec); - return -1; - } - return (jlong)restorer; +jlong Java_mmdeploy_Restorer_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_restorer_t restorer{}; + auto ec = mmdeploy_restorer_create_by_path(model_path, device_name, (int)device_id, &restorer); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create restorer, code = {}", ec); + return -1; + } + return (jlong)restorer; } -void Java_mmdeploy_Restorer_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_Restorer_destroy"); - mmdeploy_restorer_destroy((mmdeploy_restorer_t)handle); +void Java_mmdeploy_Restorer_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Restorer_destroy"); + mmdeploy_restorer_destroy((mmdeploy_restorer_t)handle); } -jobjectArray Java_mmdeploy_Restorer_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_Restorer_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_mat_t *results{}; auto ec = mmdeploy_restorer_apply((mmdeploy_restorer_t)handle, imgs, size, &results); if (ec) { @@ -68,6 +71,5 @@ jobjectArray Java_mmdeploy_Restorer_apply(JNIEnv *env, jobject thiz, jlong handl current_result++; } mmdeploy_restorer_release_result(results, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.h index 78b09787fe..7a4aec079b 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.h @@ -3,32 +3,33 @@ /* Header for class mmdeploy_Restorer */ #ifndef _Included_mmdeploy_Restorer -#define _Included_mmdeploy_Restorer -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Restorer - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Restorer_create(JNIEnv *, jobject, jstring, jstring, jint); + #define _Included_mmdeploy_Restorer + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Restorer + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Restorer_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_Restorer - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Restorer_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Restorer + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Restorer_destroy(JNIEnv*, jobject, jlong); -/* - * Class: 
mmdeploy_Restorer - * Method: apply - * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/Restorer/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Restorer_apply(JNIEnv *, jobject, jlong, jobjectArray); + /* + * Class: mmdeploy_Restorer + * Method: apply + * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/Restorer/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Restorer_apply(JNIEnv*, jobject, jlong, jobjectArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.cpp index 3872e7e158..9b34659aa5 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.cpp @@ -6,30 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_RotatedDetector_create(JNIEnv *env, jobject, jstring modelPath, - jstring deviceName, jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_rotated_detector_t rotated_detector{}; - auto ec = mmdeploy_rotated_detector_create_by_path(model_path, device_name, (int)device_id, - &rotated_detector); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create rotated detector, code = {}", ec); - return -1; - } - return (jlong)rotated_detector; +jlong Java_mmdeploy_RotatedDetector_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_rotated_detector_t rotated_detector{}; + auto ec = mmdeploy_rotated_detector_create_by_path(model_path, device_name, (int)device_id, &rotated_detector); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create rotated detector, code = {}", ec); + return -1; + } + return (jlong)rotated_detector; } -void Java_mmdeploy_RotatedDetector_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_RotatedDetector_destroy"); - mmdeploy_rotated_detector_destroy((mmdeploy_rotated_detector_t)handle); +void Java_mmdeploy_RotatedDetector_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_RotatedDetector_destroy"); + mmdeploy_rotated_detector_destroy((mmdeploy_rotated_detector_t)handle); } -jobjectArray Java_mmdeploy_RotatedDetector_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images, jintArray counts) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_RotatedDetector_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images, jintArray counts) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_rotated_detection_t *results{}; int *result_count{}; auto ec = mmdeploy_rotated_detector_apply((mmdeploy_rotated_detector_t)handle, imgs, size, @@ -56,6 +58,5 @@ jobjectArray Java_mmdeploy_RotatedDetector_apply(JNIEnv *env, jobject thiz, jlon } env->ReleaseIntArrayElements(counts, counts_array, 0); mmdeploy_rotated_detector_release_result(results, result_count); - return array; - }); + return array; }); } diff --git 
a/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.h b/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.h index 6de527ec40..7327b791ea 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.h @@ -3,34 +3,33 @@ /* Header for class mmdeploy_RotatedDetector */ #ifndef _Included_mmdeploy_RotatedDetector -#define _Included_mmdeploy_RotatedDetector -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_RotatedDetector - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_RotatedDetector_create(JNIEnv *, jobject, jstring, jstring, - jint); + #define _Included_mmdeploy_RotatedDetector + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_RotatedDetector + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_RotatedDetector_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_RotatedDetector - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_RotatedDetector_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_RotatedDetector + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_RotatedDetector_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_RotatedDetector - * Method: apply - * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/RotatedDetector/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_RotatedDetector_apply(JNIEnv *, jobject, jlong, - jobjectArray, jintArray); + /* + * Class: mmdeploy_RotatedDetector + * Method: apply + * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/RotatedDetector/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_RotatedDetector_apply(JNIEnv*, jobject, jlong, jobjectArray, jintArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.cpp index 2c1f1c42c0..3ab391c44d 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.cpp @@ -7,17 +7,20 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Scheduler_createThreadPool(JNIEnv *env, jobject, jint numThreads) { - mmdeploy_scheduler_t scheduler = mmdeploy_executor_create_thread_pool((int)numThreads); - return (jlong)scheduler; +jlong Java_mmdeploy_Scheduler_createThreadPool(JNIEnv* env, jobject, jint numThreads) +{ + mmdeploy_scheduler_t scheduler = mmdeploy_executor_create_thread_pool((int)numThreads); + return (jlong)scheduler; } -jlong Java_mmdeploy_Scheduler_createThread(JNIEnv *env, jobject) { - mmdeploy_scheduler_t scheduler = mmdeploy_executor_create_thread(); - return (jlong)scheduler; +jlong Java_mmdeploy_Scheduler_createThread(JNIEnv* env, jobject) +{ + mmdeploy_scheduler_t scheduler = mmdeploy_executor_create_thread(); + return (jlong)scheduler; } -void Java_mmdeploy_Scheduler_destroy(JNIEnv *, jobject, jlong scheduler_) { - MMDEPLOY_DEBUG("Java_mmdeploy_Scheduler_destroy"); - mmdeploy_scheduler_destroy((mmdeploy_scheduler_t)scheduler_); +void Java_mmdeploy_Scheduler_destroy(JNIEnv*, jobject, jlong scheduler_) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Scheduler_destroy"); + mmdeploy_scheduler_destroy((mmdeploy_scheduler_t)scheduler_); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.h 
b/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.h index 363015cf95..8774db0fc7 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.h @@ -3,32 +3,33 @@ /* Header for class mmdeploy_Scheduler */ #ifndef _Included_mmdeploy_Scheduler -#define _Included_mmdeploy_Scheduler -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Scheduler - * Method: createThreadPool - * Signature: (I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Scheduler_createThreadPool(JNIEnv *, jclass, jint); + #define _Included_mmdeploy_Scheduler + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Scheduler + * Method: createThreadPool + * Signature: (I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Scheduler_createThreadPool(JNIEnv*, jclass, jint); -/* - * Class: mmdeploy_Scheduler - * Method: createThread - * Signature: ()J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Scheduler_createThread(JNIEnv *, jclass); + /* + * Class: mmdeploy_Scheduler + * Method: createThread + * Signature: ()J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Scheduler_createThread(JNIEnv*, jclass); -/* - * Class: mmdeploy_Scheduler - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Scheduler_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Scheduler + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Scheduler_destroy(JNIEnv*, jobject, jlong); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.cpp index 12df31a49e..8942041c8c 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.cpp @@ -6,29 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Segmentor_create(JNIEnv *env, jobject, jstring modelPath, jstring deviceName, - jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_segmentor_t segmentor{}; - auto ec = mmdeploy_segmentor_create_by_path(model_path, device_name, (int)device_id, &segmentor); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create segmentor, code = {}", ec); - return -1; - } - return (jlong)segmentor; +jlong Java_mmdeploy_Segmentor_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_segmentor_t segmentor{}; + auto ec = mmdeploy_segmentor_create_by_path(model_path, device_name, (int)device_id, &segmentor); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create segmentor, code = {}", ec); + return -1; + } + return (jlong)segmentor; } -void Java_mmdeploy_Segmentor_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_Segmentor_destroy"); - mmdeploy_segmentor_destroy((mmdeploy_segmentor_t)handle); +void Java_mmdeploy_Segmentor_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Segmentor_destroy"); + mmdeploy_segmentor_destroy((mmdeploy_segmentor_t)handle); } -jobjectArray 
Java_mmdeploy_Segmentor_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_Segmentor_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_segmentation_t *results{}; auto ec = mmdeploy_segmentor_apply((mmdeploy_segmentor_t)handle, imgs, size, &results); if (ec) { @@ -65,6 +68,5 @@ jobjectArray Java_mmdeploy_Segmentor_apply(JNIEnv *env, jobject thiz, jlong hand env->SetObjectArrayElement(array, i, res); } mmdeploy_segmentor_release_result(results, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.h index afdf157bec..ec42c52dd5 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.h @@ -3,33 +3,33 @@ /* Header for class mmdeploy_Segmentor */ #ifndef _Included_mmdeploy_Segmentor -#define _Included_mmdeploy_Segmentor -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Segmentor - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Segmentor_create(JNIEnv *, jobject, jstring, jstring, jint); + #define _Included_mmdeploy_Segmentor + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Segmentor + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Segmentor_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_Segmentor - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Segmentor_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Segmentor + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Segmentor_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_Segmentor - * Method: apply - * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/Segmentor/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Segmentor_apply(JNIEnv *, jobject, jlong, - jobjectArray); + /* + * Class: mmdeploy_Segmentor + * Method: apply + * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/Segmentor/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Segmentor_apply(JNIEnv*, jobject, jlong, jobjectArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.cpp index 943d1e625b..adc1abe5cd 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.cpp @@ -6,30 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_TextDetector_create(JNIEnv *env, jobject, jstring modelPath, jstring deviceName, - jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_text_detector_t text_detector{}; - auto ec = mmdeploy_text_detector_create_by_path(model_path, device_name, (int)device_id, - &text_detector); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create text_detector, code = {}", ec); - return -1; - } 
- return (jlong)text_detector; +jlong Java_mmdeploy_TextDetector_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_text_detector_t text_detector{}; + auto ec = mmdeploy_text_detector_create_by_path(model_path, device_name, (int)device_id, &text_detector); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create text_detector, code = {}", ec); + return -1; + } + return (jlong)text_detector; } -void Java_mmdeploy_TextDetector_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_TextDetector_destroy"); - mmdeploy_text_detector_destroy((mmdeploy_text_detector_t)handle); +void Java_mmdeploy_TextDetector_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_TextDetector_destroy"); + mmdeploy_text_detector_destroy((mmdeploy_text_detector_t)handle); } -jobjectArray Java_mmdeploy_TextDetector_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images, jintArray counts) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_TextDetector_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images, jintArray counts) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_text_detection_t *results{}; int *result_count{}; auto ec = mmdeploy_text_detector_apply((mmdeploy_text_detector_t)handle, imgs, size, &results, @@ -61,6 +63,5 @@ jobjectArray Java_mmdeploy_TextDetector_apply(JNIEnv *env, jobject thiz, jlong h } env->ReleaseIntArrayElements(counts, counts_array, 0); mmdeploy_text_detector_release_result(results, result_count, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.h b/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.h index dc5574f77b..6a5df47924 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.h @@ -3,34 +3,33 @@ /* Header for class mmdeploy_TextDetector */ #ifndef _Included_mmdeploy_TextDetector -#define _Included_mmdeploy_TextDetector -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_TextDetector - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_TextDetector_create(JNIEnv *, jobject, jstring, jstring, - jint); + #define _Included_mmdeploy_TextDetector + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_TextDetector + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_TextDetector_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_TextDetector - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_TextDetector_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_TextDetector + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_TextDetector_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_TextDetector - * Method: apply - * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/TextDetector/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_TextDetector_apply(JNIEnv *, jobject, jlong, - jobjectArray, jintArray); + /* + * Class: 
mmdeploy_TextDetector + * Method: apply + * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/TextDetector/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_TextDetector_apply(JNIEnv*, jobject, jlong, jobjectArray, jintArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.cpp index 06987fb623..607b7c2ee8 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.cpp @@ -6,30 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_TextRecognizer_create(JNIEnv *env, jobject, jstring modelPath, - jstring deviceName, jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_text_recognizer_t text_recognizer{}; - auto ec = mmdeploy_text_recognizer_create_by_path(model_path, device_name, (int)device_id, - &text_recognizer); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create text recognizer, code = {}", ec); - return -1; - } - return (jlong)text_recognizer; +jlong Java_mmdeploy_TextRecognizer_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_text_recognizer_t text_recognizer{}; + auto ec = mmdeploy_text_recognizer_create_by_path(model_path, device_name, (int)device_id, &text_recognizer); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create text recognizer, code = {}", ec); + return -1; + } + return (jlong)text_recognizer; } -void Java_mmdeploy_TextRecognizer_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_TextRecognizer_destroy"); // maybe use info? - mmdeploy_text_recognizer_destroy((mmdeploy_text_recognizer_t)handle); +void Java_mmdeploy_TextRecognizer_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_TextRecognizer_destroy"); // maybe use info? 
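// Every create() wrapper above follows the same JNI contract: each
// GetStringUTFChars is paired with a ReleaseStringUTFChars on every path,
// including the error path, before the function returns. A condensed sketch
// of that pattern (create_by_path is a placeholder standing in for the
// various mmdeploy_*_create_by_path functions, not a real symbol):
#include <jni.h>

int create_by_path(const char* model, const char* device, int id, void** out);  // placeholder

jlong create_native_handle(JNIEnv* env, jstring modelPath, jstring deviceName, jint device_id)
{
    auto  model_path  = env->GetStringUTFChars(modelPath, nullptr);
    auto  device_name = env->GetStringUTFChars(deviceName, nullptr);
    void* handle{};
    auto  ec = create_by_path(model_path, device_name, (int)device_id, &handle);
    env->ReleaseStringUTFChars(modelPath, model_path);  // released before either return
    env->ReleaseStringUTFChars(deviceName, device_name);
    return ec ? -1 : (jlong)handle;
}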
+ mmdeploy_text_recognizer_destroy((mmdeploy_text_recognizer_t)handle); } -jobjectArray Java_mmdeploy_TextRecognizer_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_TextRecognizer_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_text_recognition_t *results{}; auto ec = mmdeploy_text_recognizer_apply((mmdeploy_text_recognizer_t)handle, imgs, size, &results); @@ -51,13 +53,12 @@ jobjectArray Java_mmdeploy_TextRecognizer_apply(JNIEnv *env, jobject thiz, jlong env->SetObjectArrayElement(array, i, res); } mmdeploy_text_recognizer_release_result(results, size); - return array; - }); + return array; }); } -jobjectArray Java_mmdeploy_TextRecognizer_applyBbox(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images, jobjectArray bboxes, - jintArray bbox_count) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) { +jobjectArray Java_mmdeploy_TextRecognizer_applyBbox(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images, jobjectArray bboxes, jintArray bbox_count) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) + { mmdeploy_text_recognition_t *recog_results{}; auto *det_results = new mmdeploy_text_detection_t[env->GetArrayLength(bboxes)]; int *det_result_count = new int[env->GetArrayLength(bbox_count)]; @@ -100,6 +101,5 @@ jobjectArray Java_mmdeploy_TextRecognizer_applyBbox(JNIEnv *env, jobject thiz, j } mmdeploy_text_recognizer_release_result(recog_results, size); mmdeploy_text_detector_release_result(det_results, det_result_count, 1); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.h b/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.h index 721c17f2b6..13ed048b7e 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.h @@ -3,43 +3,40 @@ /* Header for class mmdeploy_TextRecognizer */ #ifndef _Included_mmdeploy_TextRecognizer -#define _Included_mmdeploy_TextRecognizer -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_TextRecognizer - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_TextRecognizer_create(JNIEnv *, jobject, jstring, jstring, - jint); + #define _Included_mmdeploy_TextRecognizer + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_TextRecognizer + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_TextRecognizer_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_TextRecognizer - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_TextRecognizer_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_TextRecognizer + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_TextRecognizer_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_TextRecognizer - * Method: apply - * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/TextRecognizer/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_TextRecognizer_apply(JNIEnv *, jobject, jlong, - jobjectArray); + /* + * Class: mmdeploy_TextRecognizer + * Method: apply + * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/TextRecognizer/Result; + */ + 
JNIEXPORT jobjectArray JNICALL Java_mmdeploy_TextRecognizer_apply(JNIEnv*, jobject, jlong, jobjectArray); -/* - * Class: mmdeploy_TextRecognizer - * Method: applyBbox - * Signature: (J[Lmmdeploy/Mat;[Lmmdeploy/TextDetector/Result;[I)[Lmmdeploy/TextRecognizer/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_TextRecognizer_applyBbox(JNIEnv *, jobject, jlong, - jobjectArray, jobjectArray, - jintArray); + /* + * Class: mmdeploy_TextRecognizer + * Method: applyBbox + * Signature: (J[Lmmdeploy/Mat;[Lmmdeploy/TextDetector/Result;[I)[Lmmdeploy/TextRecognizer/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_TextRecognizer_applyBbox(JNIEnv*, jobject, jlong, jobjectArray, jobjectArray, jintArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/python/classifier.cpp b/csrc/mmdeploy/apis/python/classifier.cpp index 9916909c86..983b3357b5 100644 --- a/csrc/mmdeploy/apis/python/classifier.cpp +++ b/csrc/mmdeploy/apis/python/classifier.cpp @@ -4,64 +4,76 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyClassifier { - public: - PyClassifier(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_classifier_create_by_path(model_path, device_name, device_id, &classifier_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create classifier"); - } - } - ~PyClassifier() { - mmdeploy_classifier_destroy(classifier_); - classifier_ = {}; - } + class PyClassifier + { + public: + PyClassifier(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_classifier_create_by_path(model_path, device_name, device_id, &classifier_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create classifier"); + } + } + ~PyClassifier() + { + mmdeploy_classifier_destroy(classifier_); + classifier_ = {}; + } - std::vector>> Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } - mmdeploy_classification_t* results{}; - int* result_count{}; - auto status = mmdeploy_classifier_apply(classifier_, mats.data(), (int)mats.size(), &results, - &result_count); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply classifier, code: " + std::to_string(status)); - } - auto output = std::vector>>{}; - output.reserve(mats.size()); - auto result_ptr = results; - for (int i = 0; i < mats.size(); ++i) { - std::vector> label_score; - for (int j = 0; j < result_count[i]; ++j) { - label_score.emplace_back(result_ptr[j].label_id, result_ptr[j].score); - } - output.push_back(std::move(label_score)); - result_ptr += result_count[i]; - } - mmdeploy_classifier_release_result(results, result_count, (int)mats.size()); - return output; - } + std::vector>> Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } + mmdeploy_classification_t* results{}; + int* result_count{}; + auto status = mmdeploy_classifier_apply(classifier_, mats.data(), (int)mats.size(), &results, &result_count); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply classifier, code: " + std::to_string(status)); + } + auto output = std::vector>>{}; + output.reserve(mats.size()); + auto result_ptr = results; + for (int i = 0; i < mats.size(); ++i) + { + std::vector> label_score; 
+ for (int j = 0; j < result_count[i]; ++j) + { + label_score.emplace_back(result_ptr[j].label_id, result_ptr[j].score); + } + output.push_back(std::move(label_score)); + result_ptr += result_count[i]; + } + mmdeploy_classifier_release_result(results, result_count, (int)mats.size()); + return output; + } - private: - mmdeploy_classifier_t classifier_{}; -}; + private: + mmdeploy_classifier_t classifier_{}; + }; -static PythonBindingRegisterer register_classifier{[](py::module& m) { - py::class_(m, "Classifier") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyClassifier* self, const PyImage& img) { return self->Apply(std::vector{img})[0]; }) - .def("batch", &PyClassifier::Apply); -}}; + static PythonBindingRegisterer register_classifier{[](py::module& m) + { + py::class_(m, "Classifier") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyClassifier* self, const PyImage& img) + { return self->Apply(std::vector{img})[0]; }) + .def("batch", &PyClassifier::Apply); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/common.cpp b/csrc/mmdeploy/apis/python/common.cpp index de4e1adf0a..72ed22089a 100644 --- a/csrc/mmdeploy/apis/python/common.cpp +++ b/csrc/mmdeploy/apis/python/common.cpp @@ -7,166 +7,214 @@ #include "mmdeploy/core/utils/formatter.h" #include "pybind11/numpy.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -std::vector& gPythonBindings() { - static std::vector v; - return v; -} - -mmdeploy_mat_t GetMat(const PyImage& img) { - auto info = img.request(); - if (info.ndim != 3) { - fprintf(stderr, "info.ndim = %d\n", (int)info.ndim); - throw std::runtime_error("continuous uint8 HWC array expected"); - } - auto channels = (int)info.shape[2]; - mmdeploy_mat_t mat{}; - if (channels == 1) { - mat.format = MMDEPLOY_PIXEL_FORMAT_GRAYSCALE; - } else if (channels == 3) { - mat.format = MMDEPLOY_PIXEL_FORMAT_BGR; - } else { - throw std::runtime_error("images of 1 or 3 channels are supported"); - } - mat.height = (int)info.shape[0]; - mat.width = (int)info.shape[1]; - mat.channel = channels; - mat.type = MMDEPLOY_DATA_TYPE_UINT8; - mat.data = (uint8_t*)info.ptr; - return mat; -} + std::vector& gPythonBindings() + { + static std::vector v; + return v; + } -py::object ToPyObject(const Value& value) { - switch (value.type()) { - case ValueType::kNull: - return py::none(); - case ValueType::kBool: - return py::bool_(value.get()); - case ValueType::kInt: - return py::int_(value.get()); - case ValueType::kUInt: - return py::int_(value.get()); - case ValueType::kFloat: - return py::float_(value.get()); - case ValueType::kString: - return py::str(value.get()); - case ValueType::kArray: { - py::list list; - for (const auto& x : value) { - list.append(ToPyObject(x)); - } - return list; + mmdeploy_mat_t GetMat(const PyImage& img) + { + auto info = img.request(); + if (info.ndim != 3) + { + fprintf(stderr, "info.ndim = %d\n", (int)info.ndim); + throw std::runtime_error("continuous uint8 HWC array expected"); + } + auto channels = (int)info.shape[2]; + mmdeploy_mat_t mat{}; + if (channels == 1) + { + mat.format = MMDEPLOY_PIXEL_FORMAT_GRAYSCALE; + } + else if 
(channels == 3) + { + mat.format = MMDEPLOY_PIXEL_FORMAT_BGR; + } + else + { + throw std::runtime_error("images of 1 or 3 channels are supported"); + } + mat.height = (int)info.shape[0]; + mat.width = (int)info.shape[1]; + mat.channel = channels; + mat.type = MMDEPLOY_DATA_TYPE_UINT8; + mat.data = (uint8_t*)info.ptr; + return mat; } - case ValueType::kObject: { - py::dict dict; - for (auto it = value.begin(); it != value.end(); ++it) { - dict[it.key().c_str()] = ToPyObject(*it); - } - return dict; + + py::object ToPyObject(const Value& value) + { + switch (value.type()) + { + case ValueType::kNull: + return py::none(); + case ValueType::kBool: + return py::bool_(value.get()); + case ValueType::kInt: + return py::int_(value.get()); + case ValueType::kUInt: + return py::int_(value.get()); + case ValueType::kFloat: + return py::float_(value.get()); + case ValueType::kString: + return py::str(value.get()); + case ValueType::kArray: + { + py::list list; + for (const auto& x : value) + { + list.append(ToPyObject(x)); + } + return list; + } + case ValueType::kObject: + { + py::dict dict; + for (auto it = value.begin(); it != value.end(); ++it) + { + dict[it.key().c_str()] = ToPyObject(*it); + } + return dict; + } + case ValueType::kAny: + return py::str(""); + default: + return py::str(""); + } } - case ValueType::kAny: - return py::str(""); - default: - return py::str(""); - } -} -std::optional _to_value_internal(const void* object, mmdeploy_context_type_t type); + std::optional _to_value_internal(const void* object, mmdeploy_context_type_t type); -Value FromPyObject(const py::object& obj) { - if (py::isinstance(obj)) { - return nullptr; - } else if (py::isinstance(obj)) { - return obj.cast(); - } else if (py::isinstance(obj)) { - return obj.cast(); - } else if (py::isinstance(obj)) { - return obj.cast(); - } else if (py::isinstance(obj)) { - return obj.cast(); - } else if (py::isinstance(obj) || py::isinstance(obj)) { - py::list src(obj); - Value::Array dst; - dst.reserve(src.size()); - for (const auto& item : src) { - dst.push_back(FromPyObject(py::reinterpret_borrow(item))); + Value FromPyObject(const py::object& obj) + { + if (py::isinstance(obj)) + { + return nullptr; + } + else if (py::isinstance(obj)) + { + return obj.cast(); + } + else if (py::isinstance(obj)) + { + return obj.cast(); + } + else if (py::isinstance(obj)) + { + return obj.cast(); + } + else if (py::isinstance(obj)) + { + return obj.cast(); + } + else if (py::isinstance(obj) || py::isinstance(obj)) + { + py::list src(obj); + Value::Array dst; + dst.reserve(src.size()); + for (const auto& item : src) + { + dst.push_back(FromPyObject(py::reinterpret_borrow(item))); + } + return dst; + } + else if (py::isinstance(obj)) + { + py::dict src(obj); + Value::Object dst; + for (const auto& item : src) + { + dst.emplace(item.first.cast(), + FromPyObject(py::reinterpret_borrow(item.second))); + } + return dst; + } + else if (py::isinstance(obj)) + { + const auto& array = obj.cast(); + return *_to_value_internal(&array, MMDEPLOY_TYPE_MAT); + } + else if (py::isinstance(obj)) + { + const auto& model = + *reinterpret_cast(static_cast(obj.cast())); + return model; + } + else + { + std::stringstream ss; + ss << obj.get_type(); + MMDEPLOY_ERROR("unsupported Python object type: {}", ss.str()); + return nullptr; + } + return nullptr; } - return dst; - } else if (py::isinstance(obj)) { - py::dict src(obj); - Value::Object dst; - for (const auto& item : src) { - dst.emplace(item.first.cast(), - FromPyObject(py::reinterpret_borrow(item.second))); 
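// ToPyObject and FromPyObject form a recursive bridge between mmdeploy::Value
// and native Python objects: None/bool/int/float/str map to the scalar
// ValueType cases, list/tuple to Value::Array, dict to Value::Object, and
// numpy arrays are routed through _to_value_internal as MMDEPLOY_TYPE_MAT.
// A small round-trip helper that exercises the mapping (illustrative only;
// assumes the caller holds the GIL, as every pybind11 entry point here does):
py::object round_trip(const py::dict& config)
{
    Value v = FromPyObject(config);  // dict -> Value::Object, recursively
    return ToPyObject(v);            // Value::Object -> py::dict
}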
- } - return dst; - } else if (py::isinstance(obj)) { - const auto& array = obj.cast(); - return *_to_value_internal(&array, MMDEPLOY_TYPE_MAT); - } else if (py::isinstance(obj)) { - const auto& model = - *reinterpret_cast(static_cast(obj.cast())); - return model; - } else { - std::stringstream ss; - ss << obj.get_type(); - MMDEPLOY_ERROR("unsupported Python object type: {}", ss.str()); - return nullptr; - } - return nullptr; -} -std::pair parse_device(const std::string& device) { - auto pos = device.find(':'); - if (pos == std::string::npos) { - return {device, 0}; // logic for index -1 is not ready on some devices - } - auto name = device.substr(0, pos); - auto index = std::stoi(device.substr(pos + 1)); - return {name, index}; -} + std::pair parse_device(const std::string& device) + { + auto pos = device.find(':'); + if (pos == std::string::npos) + { + return {device, 0}; // logic for index -1 is not ready on some devices + } + auto name = device.substr(0, pos); + auto index = std::stoi(device.substr(pos + 1)); + return {name, index}; + } -static PythonBindingRegisterer register_model{[](py::module& m) { - py::class_(m, "Model") - .def(py::init([](const py::str& path) { + static PythonBindingRegisterer register_model{[](py::module& m) + { + py::class_(m, "Model") + .def(py::init([](const py::str& path) + { MMDEPLOY_DEBUG("py::init([](const py::str& path)"); - return Model(path.cast().c_str()); - })) - .def(py::init([](const py::bytes& buffer) { + return Model(path.cast().c_str()); })) + .def(py::init([](const py::bytes& buffer) + { MMDEPLOY_DEBUG("py::init([](const py::bytes& buffer)"); py::buffer_info info(py::buffer(buffer).request()); - return Model(info.ptr, info.size); - })); -}}; + return Model(info.ptr, info.size); })); + }}; -static PythonBindingRegisterer register_device{[](py::module& m) { - py::class_(m, "Device") - .def(py::init([](const std::string& device) { + static PythonBindingRegisterer register_device{[](py::module& m) + { + py::class_(m, "Device") + .def(py::init([](const std::string& device) + { auto [name, index] = parse_device(device); - return Device(name, index); - })) - .def(py::init([](const std::string& name, int index) { return Device(name, index); })); -}}; + return Device(name, index); })) + .def(py::init([](const std::string& name, int index) + { return Device(name, index); })); + }}; -static PythonBindingRegisterer register_context{[](py::module& m) { - py::class_(m, "Context") - .def(py::init([](const Device& device) { return Context(device); })) - .def("add", [](Context* self, const std::string& name, const Scheduler& sched) { - self->Add(name, sched); - }); -}}; + static PythonBindingRegisterer register_context{[](py::module& m) + { + py::class_(m, "Context") + .def(py::init([](const Device& device) + { return Context(device); })) + .def("add", [](Context* self, const std::string& name, const Scheduler& sched) + { self->Add(name, sched); }); + }}; -static PythonBindingRegisterer register_scheduler{[](py::module& m) { - py::class_(m, "Scheduler") - .def_static("thread_pool", [](int n_workers) { return Scheduler::ThreadPool(n_workers); }) - .def_static("thread", [] { return Scheduler::Thread(); }); -}}; + static PythonBindingRegisterer register_scheduler{[](py::module& m) + { + py::class_(m, "Scheduler") + .def_static("thread_pool", [](int n_workers) + { return Scheduler::ThreadPool(n_workers); }) + .def_static("thread", [] + { return Scheduler::Thread(); }); + }}; } // namespace mmdeploy::python -PYBIND11_MODULE(mmdeploy_runtime, m) { - for (const auto& 
f : mmdeploy::python::gPythonBindings()) { - f(m); - } +PYBIND11_MODULE(mmdeploy_runtime, m) +{ + for (const auto& f : mmdeploy::python::gPythonBindings()) + { + f(m); + } } diff --git a/csrc/mmdeploy/apis/python/common.h b/csrc/mmdeploy/apis/python/common.h index 5b1ca96b74..e50ed76007 100644 --- a/csrc/mmdeploy/apis/python/common.h +++ b/csrc/mmdeploy/apis/python/common.h @@ -13,24 +13,27 @@ namespace py = pybind11; -namespace mmdeploy::python { +namespace mmdeploy::python +{ -using PyImage = py::array_t; + using PyImage = py::array_t; -std::vector& gPythonBindings(); + std::vector& gPythonBindings(); -mmdeploy_mat_t GetMat(const PyImage& img); + mmdeploy_mat_t GetMat(const PyImage& img); -py::object ToPyObject(const Value& value); + py::object ToPyObject(const Value& value); -Value FromPyObject(const py::object& obj); + Value FromPyObject(const py::object& obj); -class PythonBindingRegisterer { - public: - explicit PythonBindingRegisterer(void (*register_fn)(py::module& m)) { - gPythonBindings().push_back(register_fn); - } -}; + class PythonBindingRegisterer + { + public: + explicit PythonBindingRegisterer(void (*register_fn)(py::module& m)) + { + gPythonBindings().push_back(register_fn); + } + }; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/detector.cpp b/csrc/mmdeploy/apis/python/detector.cpp index 057a92ab00..137998f6b7 100644 --- a/csrc/mmdeploy/apis/python/detector.cpp +++ b/csrc/mmdeploy/apis/python/detector.cpp @@ -4,82 +4,97 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyDetector { - public: - PyDetector(const char* model_path, const char* device_name, int device_id) { - auto status = mmdeploy_detector_create_by_path(model_path, device_name, device_id, &detector_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create detector"); - } - } - py::list Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } - mmdeploy_detection_t* detection{}; - int* result_count{}; - auto status = mmdeploy_detector_apply(detector_, mats.data(), (int)mats.size(), &detection, - &result_count); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply detector, code: " + std::to_string(status)); - } - using Sptr = std::shared_ptr; - Sptr holder(detection, [result_count, n = mats.size()](auto p) { - mmdeploy_detector_release_result(p, result_count, n); - }); - auto output = py::list{}; - auto result = detection; - for (int i = 0; i < mats.size(); ++i) { - auto bboxes = py::array_t({result_count[i], 5}); - auto labels = py::array_t(result_count[i]); - auto masks = std::vector(); - masks.reserve(result_count[i]); - for (int j = 0; j < result_count[i]; ++j, ++result) { - auto bbox = bboxes.mutable_data(j); - bbox[0] = result->bbox.left; - bbox[1] = result->bbox.top; - bbox[2] = result->bbox.right; - bbox[3] = result->bbox.bottom; - bbox[4] = result->score; - labels.mutable_at(j) = result->label_id; - if (result->mask) { - masks.emplace_back(std::array{result->mask->height, result->mask->width}, // shape - reinterpret_cast(result->mask->data), // data - py::capsule(new Sptr(holder), // handle - [](void* p) { delete reinterpret_cast(p); })); - } else { - masks.emplace_back(); + class PyDetector + { + public: + PyDetector(const char* model_path, const char* device_name, int device_id) + { + auto status = mmdeploy_detector_create_by_path(model_path, device_name, device_id, 
&detector_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create detector"); + } + } + py::list Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } + mmdeploy_detection_t* detection{}; + int* result_count{}; + auto status = mmdeploy_detector_apply(detector_, mats.data(), (int)mats.size(), &detection, &result_count); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply detector, code: " + std::to_string(status)); + } + using Sptr = std::shared_ptr; + Sptr holder(detection, [result_count, n = mats.size()](auto p) + { mmdeploy_detector_release_result(p, result_count, n); }); + auto output = py::list{}; + auto result = detection; + for (int i = 0; i < mats.size(); ++i) + { + auto bboxes = py::array_t({result_count[i], 5}); + auto labels = py::array_t(result_count[i]); + auto masks = std::vector(); + masks.reserve(result_count[i]); + for (int j = 0; j < result_count[i]; ++j, ++result) + { + auto bbox = bboxes.mutable_data(j); + bbox[0] = result->bbox.left; + bbox[1] = result->bbox.top; + bbox[2] = result->bbox.right; + bbox[3] = result->bbox.bottom; + bbox[4] = result->score; + labels.mutable_at(j) = result->label_id; + if (result->mask) + { + masks.emplace_back(std::array{result->mask->height, result->mask->width}, // shape + reinterpret_cast(result->mask->data), // data + py::capsule(new Sptr(holder), // handle + [](void* p) + { delete reinterpret_cast(p); })); + } + else + { + masks.emplace_back(); + } + } + output.append(py::make_tuple(std::move(bboxes), std::move(labels), std::move(masks))); + } + return output; + } + ~PyDetector() + { + mmdeploy_detector_destroy(detector_); + detector_ = {}; } - } - output.append(py::make_tuple(std::move(bboxes), std::move(labels), std::move(masks))); - } - return output; - } - ~PyDetector() { - mmdeploy_detector_destroy(detector_); - detector_ = {}; - } - private: - mmdeploy_detector_t detector_{}; -}; + private: + mmdeploy_detector_t detector_{}; + }; -static PythonBindingRegisterer register_detector{[](py::module& m) { - py::class_(m, "Detector") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyDetector* self, const PyImage& img) -> py::tuple { - return self->Apply(std::vector{img})[0]; - }) - .def("batch", &PyDetector::Apply); -}}; + static PythonBindingRegisterer register_detector{[](py::module& m) + { + py::class_(m, "Detector") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyDetector* self, const PyImage& img) -> py::tuple + { + return self->Apply(std::vector{img})[0]; + }) + .def("batch", &PyDetector::Apply); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/executor.cpp b/csrc/mmdeploy/apis/python/executor.cpp index eaa5c1144b..489985f232 100644 --- a/csrc/mmdeploy/apis/python/executor.cpp +++ b/csrc/mmdeploy/apis/python/executor.cpp @@ -8,39 +8,48 @@ #include "mmdeploy/execution/schedulers/single_thread_context.h" #include "mmdeploy/execution/schedulers/static_thread_pool.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ 
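// detector.cpp above hands numpy the mask memory without copying: the C
// results live in a shared_ptr whose deleter calls
// mmdeploy_detector_release_result, and every py::array carries a py::capsule
// owning one extra reference so the buffer outlives the wrapper object. A
// stripped-down sketch of that idiom (the function name and parameters are
// illustrative; assumes the pybind11 headers pulled in via common.h):
#include <cstdint>
#include <memory>

py::array_t<uint8_t> wrap_without_copy(uint8_t* data, int height, int width, std::shared_ptr<void> holder)
{
    using Sptr = std::shared_ptr<void>;
    return py::array_t<uint8_t>({height, width},  // shape
                                data,             // borrowed buffer, not copied
                                py::capsule(new Sptr(std::move(holder)),
                                            [](void* p)
                                            { delete reinterpret_cast<Sptr*>(p); }));
}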
-struct PySender { - TypeErasedSender sender_; - - explicit PySender(TypeErasedSender sender) : sender_(std::move(sender)) {} - - struct gil_guarded_deleter { - void operator()(py::object* p) const { - py::gil_scoped_acquire _; - delete p; - } - }; - using object_ptr = std::unique_ptr; - - py::object __await__() { - auto future = py::module::import("concurrent.futures").attr("Future")(); + struct PySender { - py::gil_scoped_release _; - StartDetached(std::move(sender_) | - Then([future = object_ptr{new py::object(future)}](const Value& value) mutable { + TypeErasedSender sender_; + + explicit PySender(TypeErasedSender sender) + : sender_(std::move(sender)) + { + } + + struct gil_guarded_deleter + { + void operator()(py::object* p) const + { + py::gil_scoped_acquire _; + delete p; + } + }; + using object_ptr = std::unique_ptr; + + py::object __await__() + { + auto future = py::module::import("concurrent.futures").attr("Future")(); + { + py::gil_scoped_release _; + StartDetached(std::move(sender_) | + Then([future = object_ptr{new py::object(future)}](const Value& value) mutable + { py::gil_scoped_acquire _; future->attr("set_result")(ToPyObject(value)); - delete future.release(); - })); - } - return py::module::import("asyncio").attr("wrap_future")(future).attr("__await__")(); - } -}; - -static PythonBindingRegisterer register_sender{[](py::module& m) { - py::class_>(m, "PySender") - .def("__await__", &PySender::__await__); -}}; + delete future.release(); })); + } + return py::module::import("asyncio").attr("wrap_future")(future).attr("__await__")(); + } + }; + + static PythonBindingRegisterer register_sender{[](py::module& m) + { + py::class_>(m, "PySender") + .def("__await__", &PySender::__await__); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/internal.cpp b/csrc/mmdeploy/apis/python/internal.cpp index 7373c1f184..8c38f5a7ce 100644 --- a/csrc/mmdeploy/apis/python/internal.cpp +++ b/csrc/mmdeploy/apis/python/internal.cpp @@ -9,49 +9,60 @@ #include "mmdeploy/core/model.h" #include "mmdeploy/core/value.h" -namespace mmdeploy { - -namespace python { - -framework::Mat _get_mat(const PyImage& img) { - auto info = img.request(); - if (info.ndim != 3) { - fprintf(stderr, "info.ndim = %d\n", (int)info.ndim); - throw std::runtime_error("continuous uint8 HWC array expected"); - } - auto channels = (int)info.shape[2]; - PixelFormat format; - if (channels == 1) { - format = PixelFormat::kGRAYSCALE; - } else if (channels == 3) { - format = PixelFormat::kBGR; - } else { - throw std::runtime_error("images of 1 or 3 channels are supported"); - } - - return { - (int)info.shape[0], // height - (int)info.shape[1], // width - format, // format - DataType::kINT8, // type - std::shared_ptr(info.ptr, [](void*) {}), // data - framework::Device(0), // device - }; -} - -std::optional _to_value_internal(const void* object, mmdeploy_context_type_t type) { - switch (type) { - case MMDEPLOY_TYPE_MODEL: - return Value(*(const framework::Model*)object); - case MMDEPLOY_TYPE_DEVICE: - return Value(*(const framework::Device*)object); - case MMDEPLOY_TYPE_MAT: - return _get_mat(*(const py::array*)object); - default: - return std::nullopt; - } -} - -} // namespace python +namespace mmdeploy +{ + + namespace python + { + + framework::Mat _get_mat(const PyImage& img) + { + auto info = img.request(); + if (info.ndim != 3) + { + fprintf(stderr, "info.ndim = %d\n", (int)info.ndim); + throw std::runtime_error("continuous uint8 HWC array expected"); + } + auto channels = (int)info.shape[2]; + 
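// Both GetMat (common.cpp) and _get_mat here enforce the same input
// contract: a 3-dimensional, contiguous uint8 HWC array with 1 or 3
// channels; anything else throws. A caller-side guard that mirrors those
// checks (the helper name is illustrative, not part of the patch):
inline bool is_supported_image(const py::buffer_info& info)
{
    return info.ndim == 3 && (info.shape[2] == 1 || info.shape[2] == 3);
}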
PixelFormat format; + if (channels == 1) + { + format = PixelFormat::kGRAYSCALE; + } + else if (channels == 3) + { + format = PixelFormat::kBGR; + } + else + { + throw std::runtime_error("images of 1 or 3 channels are supported"); + } + + return { + (int)info.shape[0], // height + (int)info.shape[1], // width + format, // format + DataType::kINT8, // type + std::shared_ptr(info.ptr, [](void*) {}), // data + framework::Device(0), // device + }; + } + + std::optional _to_value_internal(const void* object, mmdeploy_context_type_t type) + { + switch (type) + { + case MMDEPLOY_TYPE_MODEL: + return Value(*(const framework::Model*)object); + case MMDEPLOY_TYPE_DEVICE: + return Value(*(const framework::Device*)object); + case MMDEPLOY_TYPE_MAT: + return _get_mat(*(const py::array*)object); + default: + return std::nullopt; + } + } + + } // namespace python } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/python/pipeline.cpp b/csrc/mmdeploy/apis/python/pipeline.cpp index e3e6237e44..114bce2095 100644 --- a/csrc/mmdeploy/apis/python/pipeline.cpp +++ b/csrc/mmdeploy/apis/python/pipeline.cpp @@ -7,41 +7,47 @@ #include "mmdeploy/core/logger.h" #include "mmdeploy/core/utils/formatter.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -using namespace std::literals; + using namespace std::literals; -static PythonBindingRegisterer register_pipeline{[](py::module& m) { - py::class_(m, "Pipeline") - .def(py::init([](const py::object& config, const Context& context) { + static PythonBindingRegisterer register_pipeline{[](py::module& m) + { + py::class_(m, "Pipeline") + .def(py::init([](const py::object& config, const Context& context) + { auto _config = FromPyObject(config); - return std::make_unique(_config, context); - })) - .def("__call__", - [](Pipeline* pipeline, const py::args& args) { - auto inputs = FromPyObject(args); - for (auto& input : inputs) { - input = Value::Array{std::move(input)}; - } - auto outputs = pipeline->Apply(inputs); - for (auto& output : outputs) { - output = std::move(output[0]); - } - py::tuple rets(outputs.size()); - for (int i = 0; i < outputs.size(); ++i) { - rets[i] = ToPyObject(outputs[i]); - } - return rets; - }) - .def("batch", [](Pipeline* pipeline, const py::args& args) { + return std::make_unique(_config, context); })) + .def("__call__", + [](Pipeline* pipeline, const py::args& args) + { + auto inputs = FromPyObject(args); + for (auto& input : inputs) + { + input = Value::Array{std::move(input)}; + } + auto outputs = pipeline->Apply(inputs); + for (auto& output : outputs) + { + output = std::move(output[0]); + } + py::tuple rets(outputs.size()); + for (int i = 0; i < outputs.size(); ++i) + { + rets[i] = ToPyObject(outputs[i]); + } + return rets; + }) + .def("batch", [](Pipeline* pipeline, const py::args& args) + { auto inputs = FromPyObject(args); auto outputs = pipeline->Apply(inputs); py::tuple rets(outputs.size()); for (int i = 0; i < outputs.size(); ++i) { rets[i] = ToPyObject(outputs[i]); } - return rets; - }); -}}; + return rets; }); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/pose_detector.cpp b/csrc/mmdeploy/apis/python/pose_detector.cpp index f9d99eaf14..b6dc96560a 100644 --- a/csrc/mmdeploy/apis/python/pose_detector.cpp +++ b/csrc/mmdeploy/apis/python/pose_detector.cpp @@ -7,122 +7,143 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -using Rect = std::array; + using Rect = std::array; -class PyPoseDetector { - public: - PyPoseDetector(const char* model_path, const char* 
device_name, int device_id) { - auto status = - mmdeploy_pose_detector_create_by_path(model_path, device_name, device_id, &detector_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create pose_detector"); - } - } - py::list Apply(const std::vector& imgs, const std::vector>& bboxes) { - if (imgs.size() == 0 && bboxes.size() == 0) { - return py::list{}; - } - if (bboxes.size() != 0 && bboxes.size() != imgs.size()) { - std::ostringstream os; - os << "imgs length not equal with vboxes [" << imgs.size() << " vs " << bboxes.size() << "]"; - throw std::invalid_argument(os.str()); - } + class PyPoseDetector + { + public: + PyPoseDetector(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_pose_detector_create_by_path(model_path, device_name, device_id, &detector_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create pose_detector"); + } + } + py::list Apply(const std::vector& imgs, const std::vector>& bboxes) + { + if (imgs.size() == 0 && bboxes.size() == 0) + { + return py::list{}; + } + if (bboxes.size() != 0 && bboxes.size() != imgs.size()) + { + std::ostringstream os; + os << "imgs length not equal with vboxes [" << imgs.size() << " vs " << bboxes.size() << "]"; + throw std::invalid_argument(os.str()); + } - std::vector mats; - std::vector boxes; - std::vector bbox_count; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } + std::vector mats; + std::vector boxes; + std::vector bbox_count; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } - for (auto _boxes : bboxes) { - for (auto _box : _boxes) { - mmdeploy_rect_t box = {_box[0], _box[1], _box[2], _box[3]}; - boxes.push_back(box); - } - bbox_count.push_back(_boxes.size()); - } + for (auto _boxes : bboxes) + { + for (auto _box : _boxes) + { + mmdeploy_rect_t box = {_box[0], _box[1], _box[2], _box[3]}; + boxes.push_back(box); + } + bbox_count.push_back(_boxes.size()); + } - // full image - if (bboxes.size() == 0) { - for (int i = 0; i < mats.size(); i++) { - mmdeploy_rect_t box = {0.f, 0.f, mats[i].width - 1.f, mats[i].height - 1.f}; - boxes.push_back(box); - bbox_count.push_back(1); - } - } + // full image + if (bboxes.size() == 0) + { + for (int i = 0; i < mats.size(); i++) + { + mmdeploy_rect_t box = {0.f, 0.f, mats[i].width - 1.f, mats[i].height - 1.f}; + boxes.push_back(box); + bbox_count.push_back(1); + } + } - mmdeploy_pose_detection_t* detection{}; - auto status = mmdeploy_pose_detector_apply_bbox(detector_, mats.data(), (int)mats.size(), - boxes.data(), bbox_count.data(), &detection); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply pose_detector, code: " + std::to_string(status)); - } + mmdeploy_pose_detection_t* detection{}; + auto status = mmdeploy_pose_detector_apply_bbox(detector_, mats.data(), (int)mats.size(), boxes.data(), bbox_count.data(), &detection); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply pose_detector, code: " + std::to_string(status)); + } - auto output = py::list{}; - auto result = detection; - for (int i = 0; i < mats.size(); i++) { - int n_point = result->length; - auto pred = py::array_t({bbox_count[i], n_point, 3}); - auto dst = pred.mutable_data(); - for (int j = 0; j < bbox_count[i]; j++) { - for (int k = 0; k < n_point; k++) { - dst[0] = result->point[k].x; - dst[1] = result->point[k].y; - dst[2] = 
result->score[k]; - dst += 3; - } - result++; - } - output.append(std::move(pred)); - } + auto output = py::list{}; + auto result = detection; + for (int i = 0; i < mats.size(); i++) + { + int n_point = result->length; + auto pred = py::array_t({bbox_count[i], n_point, 3}); + auto dst = pred.mutable_data(); + for (int j = 0; j < bbox_count[i]; j++) + { + for (int k = 0; k < n_point; k++) + { + dst[0] = result->point[k].x; + dst[1] = result->point[k].y; + dst[2] = result->score[k]; + dst += 3; + } + result++; + } + output.append(std::move(pred)); + } - int total = std::accumulate(bbox_count.begin(), bbox_count.end(), 0); - mmdeploy_pose_detector_release_result(detection, total); - return output; - } - ~PyPoseDetector() { - mmdeploy_pose_detector_destroy(detector_); - detector_ = {}; - } + int total = std::accumulate(bbox_count.begin(), bbox_count.end(), 0); + mmdeploy_pose_detector_release_result(detection, total); + return output; + } + ~PyPoseDetector() + { + mmdeploy_pose_detector_destroy(detector_); + detector_ = {}; + } - private: - mmdeploy_pose_detector_t detector_{}; -}; + private: + mmdeploy_pose_detector_t detector_{}; + }; -static PythonBindingRegisterer register_pose_detector{[](py::module& m) { - py::class_(m, "PoseDetector") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyPoseDetector* self, const PyImage& img) -> py::array { - return self->Apply({img}, {})[0]; - }) - .def( - "__call__", - [](PyPoseDetector* self, const PyImage& img, const Rect& box) -> py::array { - std::vector> bboxes; - bboxes.push_back({box}); - return self->Apply({img}, bboxes)[0]; - }, - py::arg("img"), py::arg("box")) - .def( - "__call__", - [](PyPoseDetector* self, const PyImage& img, - const std::vector& bboxes) -> py::array { - std::vector> _bboxes; - _bboxes.push_back(bboxes); - return self->Apply({img}, _bboxes)[0]; - }, - py::arg("img"), py::arg("bboxes")) - .def("batch", &PyPoseDetector::Apply, py::arg("imgs"), - py::arg("bboxes") = std::vector>()); -}}; + static PythonBindingRegisterer register_pose_detector{[](py::module& m) + { + py::class_(m, "PoseDetector") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyPoseDetector* self, const PyImage& img) -> py::array + { + return self->Apply({img}, {})[0]; + }) + .def( + "__call__", + [](PyPoseDetector* self, const PyImage& img, const Rect& box) -> py::array + { + std::vector> bboxes; + bboxes.push_back({box}); + return self->Apply({img}, bboxes)[0]; + }, + py::arg("img"), + py::arg("box")) + .def( + "__call__", + [](PyPoseDetector* self, const PyImage& img, const std::vector& bboxes) -> py::array + { + std::vector> _bboxes; + _bboxes.push_back(bboxes); + return self->Apply({img}, _bboxes)[0]; + }, + py::arg("img"), + py::arg("bboxes")) + .def("batch", &PyPoseDetector::Apply, py::arg("imgs"), py::arg("bboxes") = std::vector>()); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/pose_tracker.cpp b/csrc/mmdeploy/apis/python/pose_tracker.cpp index 035ce3cdd1..c14f2450e8 100644 --- a/csrc/mmdeploy/apis/python/pose_tracker.cpp +++ b/csrc/mmdeploy/apis/python/pose_tracker.cpp @@ -5,146 +5,200 @@ #include "common.h" #include 
"mmdeploy/common.hpp" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -namespace { + namespace + { -std::vector Apply(mmdeploy::PoseTracker* self, - const std::vector& _states, - const std::vector& _frames, std::vector detect) { - std::vector tmp; - for (const auto& s : _states) { - tmp.push_back(static_cast(*s)); - } - mmdeploy::Span states(reinterpret_cast(tmp.data()), tmp.size()); - std::vector frames; - for (const auto& f : _frames) { - frames.emplace_back(GetMat(f)); - } - if (detect.empty()) { - detect.resize(frames.size(), -1); - } - assert(states.size() == frames.size()); - assert(states.size() == detect.size()); - auto results = self->Apply(states, frames, detect); - std::vector batch_ret; - batch_ret.reserve(frames.size()); - for (const auto& rs : results) { - py::array_t keypoints( - {static_cast(rs.size()), rs.size() > 0 ? rs[0].keypoint_count : 0, 3}); - py::array_t bboxes({static_cast(rs.size()), 4}); - py::array_t track_ids(static_cast(rs.size())); - auto kpts_ptr = keypoints.mutable_data(); - auto bbox_ptr = bboxes.mutable_data(); - auto track_id_ptr = track_ids.mutable_data(); - for (const auto& r : rs) { - for (int i = 0; i < r.keypoint_count; ++i) { - kpts_ptr[0] = r.keypoints[i].x; - kpts_ptr[1] = r.keypoints[i].y; - kpts_ptr[2] = r.scores[i]; - kpts_ptr += 3; - } - { - auto tmp_bbox = (std::array&)r.bbox; - bbox_ptr[0] = tmp_bbox[0]; - bbox_ptr[1] = tmp_bbox[1]; - bbox_ptr[2] = tmp_bbox[2]; - bbox_ptr[3] = tmp_bbox[3]; - bbox_ptr += 4; - } - *track_id_ptr++ = r.target_id; - } - batch_ret.push_back( - py::make_tuple(std::move(keypoints), std::move(bboxes), std::move(track_ids))); - } - return batch_ret; -} + std::vector Apply(mmdeploy::PoseTracker* self, + const std::vector& _states, + const std::vector& _frames, + std::vector detect) + { + std::vector tmp; + for (const auto& s : _states) + { + tmp.push_back(static_cast(*s)); + } + mmdeploy::Span states(reinterpret_cast(tmp.data()), tmp.size()); + std::vector frames; + for (const auto& f : _frames) + { + frames.emplace_back(GetMat(f)); + } + if (detect.empty()) + { + detect.resize(frames.size(), -1); + } + assert(states.size() == frames.size()); + assert(states.size() == detect.size()); + auto results = self->Apply(states, frames, detect); + std::vector batch_ret; + batch_ret.reserve(frames.size()); + for (const auto& rs : results) + { + py::array_t keypoints( + {static_cast(rs.size()), rs.size() > 0 ? 
rs[0].keypoint_count : 0, 3}); + py::array_t bboxes({static_cast(rs.size()), 4}); + py::array_t track_ids(static_cast(rs.size())); + auto kpts_ptr = keypoints.mutable_data(); + auto bbox_ptr = bboxes.mutable_data(); + auto track_id_ptr = track_ids.mutable_data(); + for (const auto& r : rs) + { + for (int i = 0; i < r.keypoint_count; ++i) + { + kpts_ptr[0] = r.keypoints[i].x; + kpts_ptr[1] = r.keypoints[i].y; + kpts_ptr[2] = r.scores[i]; + kpts_ptr += 3; + } + { + auto tmp_bbox = (std::array&)r.bbox; + bbox_ptr[0] = tmp_bbox[0]; + bbox_ptr[1] = tmp_bbox[1]; + bbox_ptr[2] = tmp_bbox[2]; + bbox_ptr[3] = tmp_bbox[3]; + bbox_ptr += 4; + } + *track_id_ptr++ = r.target_id; + } + batch_ret.push_back( + py::make_tuple(std::move(keypoints), std::move(bboxes), std::move(track_ids))); + } + return batch_ret; + } -template -void Copy(const py::handle& h, T (&a)[N]) { - auto array = h.cast>(); - assert(array.size() == N); - auto data = array.data(); - for (int i = 0; i < N; ++i) { - a[i] = data[i]; - } -} + template + void Copy(const py::handle& h, T (&a)[N]) + { + auto array = h.cast>(); + assert(array.size() == N); + auto data = array.data(); + for (int i = 0; i < N; ++i) + { + a[i] = data[i]; + } + } -void Parse(const py::dict& dict, PoseTracker::Params& params, py::array_t& sigmas) { - for (const auto& [_name, value] : dict) { - auto name = _name.cast(); - if (name == "det_interval") { - params->det_interval = value.cast(); - } else if (name == "det_label") { - params->det_label = value.cast(); - } else if (name == "det_thr") { - params->det_thr = value.cast(); - } else if (name == "det_min_bbox_size") { - params->det_min_bbox_size = value.cast(); - } else if (name == "det_nms_thr") { - params->det_nms_thr = value.cast(); - } else if (name == "pose_max_num_bboxes") { - params->pose_max_num_bboxes = value.cast(); - } else if (name == "pose_min_keypoints") { - params->pose_min_keypoints = value.cast(); - } else if (name == "pose_min_bbox_size") { - params->pose_min_bbox_size = value.cast(); - } else if (name == "pose_nms_thr") { - params->pose_nms_thr = value.cast(); - } else if (name == "track_kpt_thr") { - params->pose_kpt_thr = value.cast(); - } else if (name == "track_iou_thr") { - params->track_iou_thr = value.cast(); - } else if (name == "pose_bbox_scale") { - params->pose_bbox_scale = value.cast(); - } else if (name == "track_max_missing") { - params->track_max_missing = value.cast(); - } else if (name == "track_history_size") { - params->track_history_size = value.cast(); - } else if (name == "keypoint_sigmas") { - sigmas = value.cast>(); - params->keypoint_sigmas = const_cast(sigmas.data()); - params->keypoint_sigmas_size = sigmas.size(); - } else if (name == "std_weight_position") { - params->std_weight_position = value.cast(); - } else if (name == "std_weight_velocity") { - params->std_weight_velocity = value.cast(); - } else if (name == "smooth_params") { - Copy(value, params->smooth_params); - } else { - MMDEPLOY_ERROR("unused argument: {}", name); - } - } -} + void Parse(const py::dict& dict, PoseTracker::Params& params, py::array_t& sigmas) + { + for (const auto& [_name, value] : dict) + { + auto name = _name.cast(); + if (name == "det_interval") + { + params->det_interval = value.cast(); + } + else if (name == "det_label") + { + params->det_label = value.cast(); + } + else if (name == "det_thr") + { + params->det_thr = value.cast(); + } + else if (name == "det_min_bbox_size") + { + params->det_min_bbox_size = value.cast(); + } + else if (name == "det_nms_thr") + { + params->det_nms_thr 
= value.cast(); + } + else if (name == "pose_max_num_bboxes") + { + params->pose_max_num_bboxes = value.cast(); + } + else if (name == "pose_min_keypoints") + { + params->pose_min_keypoints = value.cast(); + } + else if (name == "pose_min_bbox_size") + { + params->pose_min_bbox_size = value.cast(); + } + else if (name == "pose_nms_thr") + { + params->pose_nms_thr = value.cast(); + } + else if (name == "track_kpt_thr") + { + params->pose_kpt_thr = value.cast(); + } + else if (name == "track_iou_thr") + { + params->track_iou_thr = value.cast(); + } + else if (name == "pose_bbox_scale") + { + params->pose_bbox_scale = value.cast(); + } + else if (name == "track_max_missing") + { + params->track_max_missing = value.cast(); + } + else if (name == "track_history_size") + { + params->track_history_size = value.cast(); + } + else if (name == "keypoint_sigmas") + { + sigmas = value.cast>(); + params->keypoint_sigmas = const_cast(sigmas.data()); + params->keypoint_sigmas_size = sigmas.size(); + } + else if (name == "std_weight_position") + { + params->std_weight_position = value.cast(); + } + else if (name == "std_weight_velocity") + { + params->std_weight_velocity = value.cast(); + } + else if (name == "smooth_params") + { + Copy(value, params->smooth_params); + } + else + { + MMDEPLOY_ERROR("unused argument: {}", name); + } + } + } -} // namespace + } // namespace -static PythonBindingRegisterer register_pose_tracker{[](py::module& m) { - py::class_(m, "PoseTracker.State"); - py::class_(m, "PoseTracker") - .def(py::init([](const char* det_model_path, const char* pose_model_path, - const char* device_name, int device_id) { - return mmdeploy::PoseTracker( - mmdeploy::Model(det_model_path), mmdeploy::Model(pose_model_path), - mmdeploy::Context(mmdeploy::Device(device_name, device_id))); - }), - py::arg("det_model"), py::arg("pose_model"), py::arg("device_name"), - py::arg("device_id") = 0) - .def( - "__call__", - [](mmdeploy::PoseTracker* self, mmdeploy::PoseTracker::State* state, const PyImage& img, - int detect) { return Apply(self, {state}, {img}, {detect})[0]; }, - py::arg("state"), py::arg("frame"), py::arg("detect") = -1) - .def("batch", &Apply, py::arg("states"), py::arg("frames"), - py::arg("detects") = std::vector{}) - .def("create_state", [](mmdeploy::PoseTracker* self, const py::kwargs& kwargs) { + static PythonBindingRegisterer register_pose_tracker{[](py::module& m) + { + py::class_(m, "PoseTracker.State"); + py::class_(m, "PoseTracker") + .def(py::init([](const char* det_model_path, const char* pose_model_path, const char* device_name, int device_id) + { return mmdeploy::PoseTracker( + mmdeploy::Model(det_model_path), + mmdeploy::Model(pose_model_path), + mmdeploy::Context(mmdeploy::Device(device_name, device_id))); }), + py::arg("det_model"), + py::arg("pose_model"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def( + "__call__", + [](mmdeploy::PoseTracker* self, mmdeploy::PoseTracker::State* state, const PyImage& img, int detect) + { return Apply(self, {state}, {img}, {detect})[0]; }, + py::arg("state"), + py::arg("frame"), + py::arg("detect") = -1) + .def("batch", &Apply, py::arg("states"), py::arg("frames"), py::arg("detects") = std::vector{}) + .def("create_state", [](mmdeploy::PoseTracker* self, const py::kwargs& kwargs) + { PoseTracker::Params params; py::array_t sigmas; if (kwargs) { Parse(kwargs, params, sigmas); } - return self->CreateState(params); - }); -}}; + return self->CreateState(params); }); + }}; } // namespace mmdeploy::python diff --git 
a/csrc/mmdeploy/apis/python/restorer.cpp b/csrc/mmdeploy/apis/python/restorer.cpp index 771af2a6c4..ddd4c0a8ff 100644 --- a/csrc/mmdeploy/apis/python/restorer.cpp +++ b/csrc/mmdeploy/apis/python/restorer.cpp @@ -4,63 +4,77 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyRestorer { - public: - PyRestorer(const char* model_path, const char* device_name, int device_id) { - auto status = mmdeploy_restorer_create_by_path(model_path, device_name, device_id, &restorer_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create restorer"); - } - } - ~PyRestorer() { - mmdeploy_restorer_destroy(restorer_); - restorer_ = {}; - } + class PyRestorer + { + public: + PyRestorer(const char* model_path, const char* device_name, int device_id) + { + auto status = mmdeploy_restorer_create_by_path(model_path, device_name, device_id, &restorer_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create restorer"); + } + } + ~PyRestorer() + { + mmdeploy_restorer_destroy(restorer_); + restorer_ = {}; + } - std::vector Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } - mmdeploy_mat_t* results{}; - auto status = mmdeploy_restorer_apply(restorer_, mats.data(), (int)mats.size(), &results); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply restorer, code: " + std::to_string(status)); - } - using Sptr = std::shared_ptr; - Sptr holder(results, [n = mats.size()](auto p) { mmdeploy_restorer_release_result(p, n); }); + std::vector Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } + mmdeploy_mat_t* results{}; + auto status = mmdeploy_restorer_apply(restorer_, mats.data(), (int)mats.size(), &results); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply restorer, code: " + std::to_string(status)); + } + using Sptr = std::shared_ptr; + Sptr holder(results, [n = mats.size()](auto p) + { mmdeploy_restorer_release_result(p, n); }); - std::vector rets(mats.size()); - for (int i = 0; i < mats.size(); ++i) { - rets[i] = { - {results[i].height, results[i].width, results[i].channel}, // shape - results[i].data, // data - py::capsule(new Sptr(holder), // handle - [](void* p) { delete reinterpret_cast(p); }) // - }; - } - return rets; - } + std::vector rets(mats.size()); + for (int i = 0; i < mats.size(); ++i) + { + rets[i] = { + {results[i].height, results[i].width, results[i].channel}, // shape + results[i].data, // data + py::capsule(new Sptr(holder), // handle + [](void* p) + { delete reinterpret_cast(p); }) // + }; + } + return rets; + } - private: - mmdeploy_restorer_t restorer_{}; -}; + private: + mmdeploy_restorer_t restorer_{}; + }; -static PythonBindingRegisterer register_restorer{[](py::module& m) { - py::class_(m, "Restorer") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyRestorer* self, const PyImage& img) -> py::array { - return self->Apply(std::vector{img})[0]; - }) - .def("batch", &PyRestorer::Apply); -}}; + static PythonBindingRegisterer register_restorer{[](py::module& m) + { + py::class_(m, "Restorer") + 
.def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyRestorer* self, const PyImage& img) -> py::array + { + return self->Apply(std::vector{img})[0]; + }) + .def("batch", &PyRestorer::Apply); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/rotated_detector.cpp b/csrc/mmdeploy/apis/python/rotated_detector.cpp index bc760b04e4..148b31fa6e 100644 --- a/csrc/mmdeploy/apis/python/rotated_detector.cpp +++ b/csrc/mmdeploy/apis/python/rotated_detector.cpp @@ -4,74 +4,87 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyRotatedDetector { - public: - PyRotatedDetector(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_rotated_detector_create_by_path(model_path, device_name, device_id, &detector_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create rotated detector"); - } - } - py::list Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } + class PyRotatedDetector + { + public: + PyRotatedDetector(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_rotated_detector_create_by_path(model_path, device_name, device_id, &detector_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create rotated detector"); + } + } + py::list Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } - mmdeploy_rotated_detection_t* rbboxes{}; - int* res_count{}; - auto status = mmdeploy_rotated_detector_apply(detector_, mats.data(), (int)mats.size(), - &rbboxes, &res_count); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply rotated detector, code: " + std::to_string(status)); - } - auto output = py::list{}; - auto result = rbboxes; - auto counts = res_count; - for (int i = 0; i < mats.size(); i++) { - auto _dets = py::array_t({*counts, 6}); - auto _labels = py::array_t({*counts}); - auto dets = _dets.mutable_data(); - auto labels = _labels.mutable_data(); - for (int j = 0; j < *counts; j++) { - for (int k = 0; k < 5; k++) { - *dets++ = result->rbbox[k]; + mmdeploy_rotated_detection_t* rbboxes{}; + int* res_count{}; + auto status = mmdeploy_rotated_detector_apply(detector_, mats.data(), (int)mats.size(), &rbboxes, &res_count); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply rotated detector, code: " + std::to_string(status)); + } + auto output = py::list{}; + auto result = rbboxes; + auto counts = res_count; + for (int i = 0; i < mats.size(); i++) + { + auto _dets = py::array_t({*counts, 6}); + auto _labels = py::array_t({*counts}); + auto dets = _dets.mutable_data(); + auto labels = _labels.mutable_data(); + for (int j = 0; j < *counts; j++) + { + for (int k = 0; k < 5; k++) + { + *dets++ = result->rbbox[k]; + } + *dets++ = result->score; + *labels++ = result->label_id; + result++; + } + counts++; + output.append(py::make_tuple(std::move(_dets), std::move(_labels))); + } + mmdeploy_rotated_detector_release_result(rbboxes, res_count); + return output; + } + ~PyRotatedDetector() + { + mmdeploy_rotated_detector_destroy(detector_); 
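The restorer binding above returns zero-copy numpy arrays over buffers owned by the C API: a `std::shared_ptr` with a custom deleter is parked inside a `py::capsule`, so `mmdeploy_restorer_release_result` runs only once the last array is garbage-collected. A minimal sketch of that ownership idiom, with hypothetical `make_result` / `release_result` stand-ins for the C API:

    #include <memory>

    #include <pybind11/numpy.h>

    namespace py = pybind11;

    // Hypothetical stand-ins for a C API that allocates and frees a result buffer.
    inline float* make_result(int n) { return new float[n](); }
    inline void   release_result(float* p) { delete[] p; }

    // Wrap the C buffer as a zero-copy numpy array: the capsule owns one extra
    // shared_ptr reference, so the buffer is released only after the array
    // viewing it has been destroyed.
    inline py::array_t<float> wrap_result(int n)
    {
        using Holder = std::shared_ptr<float>;
        Holder holder(make_result(n), [](float* p) { release_result(p); });
        py::capsule base(new Holder(holder),
                         [](void* p) { delete reinterpret_cast<Holder*>(p); });
        return py::array_t<float>({n}, holder.get(), base);
    }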
+ detector_ = {}; } - *dets++ = result->score; - *labels++ = result->label_id; - result++; - } - counts++; - output.append(py::make_tuple(std::move(_dets), std::move(_labels))); - } - mmdeploy_rotated_detector_release_result(rbboxes, res_count); - return output; - } - ~PyRotatedDetector() { - mmdeploy_rotated_detector_destroy(detector_); - detector_ = {}; - } - private: - mmdeploy_rotated_detector_t detector_{}; -}; + private: + mmdeploy_rotated_detector_t detector_{}; + }; -static PythonBindingRegisterer register_rotated_detector{[](py::module& m) { - py::class_(m, "RotatedDetector") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyRotatedDetector* self, const PyImage& img) -> py::tuple { - return self->Apply(std::vector{img})[0]; - }) - .def("batch", &PyRotatedDetector::Apply); -}}; + static PythonBindingRegisterer register_rotated_detector{[](py::module& m) + { + py::class_(m, "RotatedDetector") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyRotatedDetector* self, const PyImage& img) -> py::tuple + { + return self->Apply(std::vector{img})[0]; + }) + .def("batch", &PyRotatedDetector::Apply); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/segmentor.cpp b/csrc/mmdeploy/apis/python/segmentor.cpp index 940972ab61..9e1db508c7 100644 --- a/csrc/mmdeploy/apis/python/segmentor.cpp +++ b/csrc/mmdeploy/apis/python/segmentor.cpp @@ -4,74 +4,91 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PySegmentor { - public: - PySegmentor(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_segmentor_create_by_path(model_path, device_name, device_id, &segmentor_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create segmentor"); - } - } - ~PySegmentor() { - mmdeploy_segmentor_destroy(segmentor_); - segmentor_ = {}; - } + class PySegmentor + { + public: + PySegmentor(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_segmentor_create_by_path(model_path, device_name, device_id, &segmentor_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create segmentor"); + } + } + ~PySegmentor() + { + mmdeploy_segmentor_destroy(segmentor_); + segmentor_ = {}; + } - std::vector Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } - mmdeploy_segmentation_t* segm{}; - auto status = mmdeploy_segmentor_apply(segmentor_, mats.data(), (int)mats.size(), &segm); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply segmentor, code: " + std::to_string(status)); - } - using Sptr = std::shared_ptr; - Sptr holder(segm, [n = mats.size()](auto p) { mmdeploy_segmentor_release_result(p, n); }); + std::vector Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } + mmdeploy_segmentation_t* segm{}; + auto status = mmdeploy_segmentor_apply(segmentor_, mats.data(), 
(int)mats.size(), &segm); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply segmentor, code: " + std::to_string(status)); + } + using Sptr = std::shared_ptr; + Sptr holder(segm, [n = mats.size()](auto p) + { mmdeploy_segmentor_release_result(p, n); }); - std::vector rets(mats.size()); - for (size_t i = 0; i < mats.size(); ++i) { - if (segm[i].mask != nullptr) { - rets[i] = { - {segm[i].height, segm[i].width}, // shape - segm[i].mask, // mask - py::capsule(new Sptr(holder), // handle - [](void* p) { delete reinterpret_cast(p); }) // - }; - } - if (segm[i].score != nullptr) { - rets[i] = { - {segm[i].classes, segm[i].height, segm[i].width}, // shape - segm[i].score, // score - py::capsule(new Sptr(holder), // handle - [](void* p) { delete reinterpret_cast(p); }) // - }; - } - } - return rets; - } + std::vector rets(mats.size()); + for (size_t i = 0; i < mats.size(); ++i) + { + if (segm[i].mask != nullptr) + { + rets[i] = { + {segm[i].height, segm[i].width}, // shape + segm[i].mask, // mask + py::capsule(new Sptr(holder), // handle + [](void* p) + { delete reinterpret_cast(p); }) // + }; + } + if (segm[i].score != nullptr) + { + rets[i] = { + {segm[i].classes, segm[i].height, segm[i].width}, // shape + segm[i].score, // score + py::capsule(new Sptr(holder), // handle + [](void* p) + { delete reinterpret_cast(p); }) // + }; + } + } + return rets; + } - private: - mmdeploy_segmentor_t segmentor_{}; -}; + private: + mmdeploy_segmentor_t segmentor_{}; + }; -static PythonBindingRegisterer register_segmentor{[](py::module& m) { - py::class_(m, "Segmentor") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PySegmentor* self, const PyImage& img) -> py::array { - return self->Apply(std::vector{img})[0]; - }) - .def("batch", &PySegmentor::Apply); -}}; + static PythonBindingRegisterer register_segmentor{[](py::module& m) + { + py::class_(m, "Segmentor") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PySegmentor* self, const PyImage& img) -> py::array + { + return self->Apply(std::vector{img})[0]; + }) + .def("batch", &PySegmentor::Apply); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/text_detector.cpp b/csrc/mmdeploy/apis/python/text_detector.cpp index 19762d08ec..1326588a1f 100644 --- a/csrc/mmdeploy/apis/python/text_detector.cpp +++ b/csrc/mmdeploy/apis/python/text_detector.cpp @@ -4,68 +4,81 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyTextDetector { - public: - PyTextDetector(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_text_detector_create_by_path(model_path, device_name, device_id, &detector_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create text_detector"); - } - } - std::vector> Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } - mmdeploy_text_detection_t* detection{}; - int* result_count{}; - auto status = mmdeploy_text_detector_apply(detector_, mats.data(), (int)mats.size(), &detection, 
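Two result-marshalling styles appear above: the segmentor aliases the C buffers directly (capsule-held, zero-copy), while the rotated detector copies each record into a freshly allocated `(N, 6)` array of five box parameters plus score, with labels in a parallel `(N,)` array. A compact sketch of the copying variant, using a hypothetical mirror of `mmdeploy_rotated_detection_t`:

    #include <vector>

    #include <pybind11/numpy.h>

    namespace py = pybind11;

    // Hypothetical mirror of mmdeploy_rotated_detection_t:
    // five box parameters plus a score and a label id.
    struct RotatedDet
    {
        float rbbox[5];
        float score;
        int   label_id;
    };

    inline py::tuple pack(const std::vector<RotatedDet>& dets)
    {
        auto boxes  = py::array_t<float>({(int)dets.size(), 6});
        auto labels = py::array_t<int>({(int)dets.size()});
        auto* b = boxes.mutable_data();
        auto* l = labels.mutable_data();
        for (const auto& d : dets)
        {
            for (float v : d.rbbox)  // the five box parameters
            {
                *b++ = v;
            }
            *b++ = d.score;   // sixth column
            *l++ = d.label_id;
        }
        return py::make_tuple(boxes, labels);
    }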
- &result_count); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply text_detector, code: " + std::to_string(status)); - } - auto output = std::vector>{}; - auto result = detection; - for (int i = 0; i < mats.size(); ++i) { - auto bboxes = py::array_t({result_count[i], 9}); - for (int j = 0; j < result_count[i]; ++j, ++result) { - auto data = bboxes.mutable_data(j); - for (const auto& p : result->bbox) { - *data++ = p.x; - *data++ = p.y; + class PyTextDetector + { + public: + PyTextDetector(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_text_detector_create_by_path(model_path, device_name, device_id, &detector_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create text_detector"); + } + } + std::vector> Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } + mmdeploy_text_detection_t* detection{}; + int* result_count{}; + auto status = mmdeploy_text_detector_apply(detector_, mats.data(), (int)mats.size(), &detection, &result_count); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply text_detector, code: " + std::to_string(status)); + } + auto output = std::vector>{}; + auto result = detection; + for (int i = 0; i < mats.size(); ++i) + { + auto bboxes = py::array_t({result_count[i], 9}); + for (int j = 0; j < result_count[i]; ++j, ++result) + { + auto data = bboxes.mutable_data(j); + for (const auto& p : result->bbox) + { + *data++ = p.x; + *data++ = p.y; + } + *data++ = result->score; + } + output.push_back(std::move(bboxes)); + } + mmdeploy_text_detector_release_result(detection, result_count, (int)mats.size()); + return output; + } + ~PyTextDetector() + { + mmdeploy_text_detector_destroy(detector_); + detector_ = {}; } - *data++ = result->score; - } - output.push_back(std::move(bboxes)); - } - mmdeploy_text_detector_release_result(detection, result_count, (int)mats.size()); - return output; - } - ~PyTextDetector() { - mmdeploy_text_detector_destroy(detector_); - detector_ = {}; - } - private: - mmdeploy_text_detector_t detector_{}; -}; + private: + mmdeploy_text_detector_t detector_{}; + }; -static PythonBindingRegisterer register_text_detector{[](py::module& m) { - py::class_(m, "TextDetector") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyTextDetector* self, const PyImage& img) -> py::array { - return self->Apply(std::vector{img})[0]; - }) - .def("batch", &PyTextDetector::Apply); -}}; + static PythonBindingRegisterer register_text_detector{[](py::module& m) + { + py::class_(m, "TextDetector") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyTextDetector* self, const PyImage& img) -> py::array + { + return self->Apply(std::vector{img})[0]; + }) + .def("batch", &PyTextDetector::Apply); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/text_recognizer.cpp b/csrc/mmdeploy/apis/python/text_recognizer.cpp index 317f55103a..1b3bc92af8 100644 --- a/csrc/mmdeploy/apis/python/text_recognizer.cpp +++ 
b/csrc/mmdeploy/apis/python/text_recognizer.cpp @@ -4,79 +4,99 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyTextRecognizer { - public: - PyTextRecognizer(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_text_recognizer_create_by_path(model_path, device_name, device_id, &recognizer_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create text_recognizer"); - } - } - std::vector>> Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } - mmdeploy_text_recognition_t* results{}; - auto status = - mmdeploy_text_recognizer_apply(recognizer_, mats.data(), (int)mats.size(), &results); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply text_recognizer, code: " + std::to_string(status)); - } - auto output = std::vector>>{}; - for (int i = 0; i < mats.size(); ++i) { - std::vector score(results[i].score, results[i].score + results[i].length); - output.emplace_back(results[i].text, std::move(score)); - } - mmdeploy_text_recognizer_release_result(results, (int)mats.size()); - return output; - } - std::vector>> Apply(const PyImage& img, - const std::vector& bboxes) { - if (bboxes.size() * sizeof(float) % sizeof(mmdeploy_text_detection_t)) { - throw std::invalid_argument("bboxes is not a list of 'mmdeploy_text_detection_t'"); - } - auto mat = GetMat(img); - int bbox_count = bboxes.size() * sizeof(float) / sizeof(mmdeploy_text_detection_t); - mmdeploy_text_recognition_t* results{}; - auto status = mmdeploy_text_recognizer_apply_bbox( - recognizer_, &mat, 1, (mmdeploy_text_detection_t*)bboxes.data(), &bbox_count, &results); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply text_recognizer, code: " + std::to_string(status)); - } - auto output = std::vector>>{}; - for (int i = 0; i < bbox_count; ++i) { - std::vector score(results[i].score, results[i].score + results[i].length); - output.emplace_back(results[i].text, std::move(score)); - } - mmdeploy_text_recognizer_release_result(results, bbox_count); - return output; - } - ~PyTextRecognizer() { - mmdeploy_text_recognizer_destroy(recognizer_); - recognizer_ = {}; - } + class PyTextRecognizer + { + public: + PyTextRecognizer(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_text_recognizer_create_by_path(model_path, device_name, device_id, &recognizer_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create text_recognizer"); + } + } + std::vector>> Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } + mmdeploy_text_recognition_t* results{}; + auto status = + mmdeploy_text_recognizer_apply(recognizer_, mats.data(), (int)mats.size(), &results); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply text_recognizer, code: " + std::to_string(status)); + } + auto output = std::vector>>{}; + for (int i = 0; i < mats.size(); ++i) + { + std::vector score(results[i].score, results[i].score + results[i].length); + output.emplace_back(results[i].text, std::move(score)); + } + mmdeploy_text_recognizer_release_result(results, (int)mats.size()); + return output; + } + std::vector>> Apply(const PyImage& img, + const std::vector& bboxes) + { + if (bboxes.size() * 
sizeof(float) % sizeof(mmdeploy_text_detection_t)) + { + throw std::invalid_argument("bboxes is not a list of 'mmdeploy_text_detection_t'"); + } + auto mat = GetMat(img); + int bbox_count = bboxes.size() * sizeof(float) / sizeof(mmdeploy_text_detection_t); + mmdeploy_text_recognition_t* results{}; + auto status = mmdeploy_text_recognizer_apply_bbox( + recognizer_, + &mat, + 1, + (mmdeploy_text_detection_t*)bboxes.data(), + &bbox_count, + &results); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply text_recognizer, code: " + std::to_string(status)); + } + auto output = std::vector>>{}; + for (int i = 0; i < bbox_count; ++i) + { + std::vector score(results[i].score, results[i].score + results[i].length); + output.emplace_back(results[i].text, std::move(score)); + } + mmdeploy_text_recognizer_release_result(results, bbox_count); + return output; + } + ~PyTextRecognizer() + { + mmdeploy_text_recognizer_destroy(recognizer_); + recognizer_ = {}; + } - private: - mmdeploy_text_recognizer_t recognizer_{}; -}; + private: + mmdeploy_text_recognizer_t recognizer_{}; + }; -static PythonBindingRegisterer register_text_recognizer{[](py::module& m) { - py::class_(m, "TextRecognizer") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", [](PyTextRecognizer* self, - const PyImage& img) { return self->Apply(std::vector{img})[0]; }) - .def("__call__", [](PyTextRecognizer* self, const PyImage& img, - const std::vector& bboxes) { return self->Apply(img, bboxes); }) - .def("batch", py::overload_cast&>(&PyTextRecognizer::Apply)); -}}; + static PythonBindingRegisterer register_text_recognizer{[](py::module& m) + { + py::class_(m, "TextRecognizer") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", [](PyTextRecognizer* self, const PyImage& img) + { return self->Apply(std::vector{img})[0]; }) + .def("__call__", [](PyTextRecognizer* self, const PyImage& img, const std::vector& bboxes) + { return self->Apply(img, bboxes); }) + .def("batch", py::overload_cast&>(&PyTextRecognizer::Apply)); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/video_recognizer.cpp b/csrc/mmdeploy/apis/python/video_recognizer.cpp index 7c70337e51..ac2e691be3 100644 --- a/csrc/mmdeploy/apis/python/video_recognizer.cpp +++ b/csrc/mmdeploy/apis/python/video_recognizer.cpp @@ -4,85 +4,102 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyVideoRecognizer { - public: - PyVideoRecognizer(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_video_recognizer_create_by_path(model_path, device_name, device_id, &recognizer_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create video_recognizer"); - } - } - std::vector>> Apply( - const std::vector>& imgs, const std::vector>& info) { - if (info.size() != imgs.size()) { - throw std::invalid_argument("the length of info is not equal with imgs"); - } - for (int i = 0; i < info.size(); i++) { - if (imgs[i].size() != info[i].first * info[i].second) { - throw std::invalid_argument("invalid info"); - } - } - int total = 0; - for (int i = 0; i < imgs.size(); 
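The bbox overload above takes the detections as a flat float list and reinterprets it as `mmdeploy_text_detection_t` records. The modulus test is sound because each record is four corner points plus one score, i.e. nine floats, matching the `(N, 9)` arrays the text detector produces. A self-contained illustration with stand-in structs:

    #include <cstddef>
    #include <stdexcept>
    #include <vector>

    // Stand-ins for mmdeploy_point_t / mmdeploy_text_detection_t: four corner
    // points (eight floats) plus a score, i.e. nine floats per detection.
    struct Point
    {
        float x, y;
    };
    struct TextDetection
    {
        Point bbox[4];
        float score;
    };
    static_assert(sizeof(TextDetection) == 9 * sizeof(float), "unexpected padding");

    // A flat float list is a valid detection array only if its byte size is a
    // whole multiple of the record size; the quotient is the detection count.
    inline int count_detections(const std::vector<float>& flat)
    {
        if (flat.size() * sizeof(float) % sizeof(TextDetection))
        {
            throw std::invalid_argument("not a whole number of detections");
        }
        return static_cast<int>(flat.size() * sizeof(float) / sizeof(TextDetection));
    }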
i++) { - total += imgs[i].size(); - } - std::vector clips; - std::vector clip_info; - clips.reserve(total); - clip_info.reserve(total); - for (int i = 0; i < imgs.size(); i++) { - for (const auto& img : imgs[i]) { - auto mat = GetMat(img); - clips.push_back(mat); - } - clip_info.push_back({info[i].first, info[i].second}); - } + class PyVideoRecognizer + { + public: + PyVideoRecognizer(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_video_recognizer_create_by_path(model_path, device_name, device_id, &recognizer_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create video_recognizer"); + } + } + std::vector>> Apply( + const std::vector>& imgs, + const std::vector>& info) + { + if (info.size() != imgs.size()) + { + throw std::invalid_argument("the length of info is not equal with imgs"); + } + for (int i = 0; i < info.size(); i++) + { + if (imgs[i].size() != info[i].first * info[i].second) + { + throw std::invalid_argument("invalid info"); + } + } + int total = 0; + for (int i = 0; i < imgs.size(); i++) + { + total += imgs[i].size(); + } + std::vector clips; + std::vector clip_info; + clips.reserve(total); + clip_info.reserve(total); + for (int i = 0; i < imgs.size(); i++) + { + for (const auto& img : imgs[i]) + { + auto mat = GetMat(img); + clips.push_back(mat); + } + clip_info.push_back({info[i].first, info[i].second}); + } - mmdeploy_video_recognition_t* results{}; - int* result_count{}; - auto status = mmdeploy_video_recognizer_apply(recognizer_, clips.data(), clip_info.data(), 1, - &results, &result_count); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply video_recognizer, code: " + std::to_string(status)); - } + mmdeploy_video_recognition_t* results{}; + int* result_count{}; + auto status = mmdeploy_video_recognizer_apply(recognizer_, clips.data(), clip_info.data(), 1, &results, &result_count); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply video_recognizer, code: " + std::to_string(status)); + } - auto output = std::vector>>{}; - output.reserve(imgs.size()); - auto result_ptr = results; - for (int i = 0; i < imgs.size(); ++i) { - std::vector> label_score; - for (int j = 0; j < result_count[i]; ++j) { - label_score.emplace_back(result_ptr[j].label_id, result_ptr[j].score); - } - output.push_back(std::move(label_score)); - result_ptr += result_count[i]; - } - mmdeploy_video_recognizer_release_result(results, result_count, (int)imgs.size()); - return output; - } + auto output = std::vector>>{}; + output.reserve(imgs.size()); + auto result_ptr = results; + for (int i = 0; i < imgs.size(); ++i) + { + std::vector> label_score; + for (int j = 0; j < result_count[i]; ++j) + { + label_score.emplace_back(result_ptr[j].label_id, result_ptr[j].score); + } + output.push_back(std::move(label_score)); + result_ptr += result_count[i]; + } + mmdeploy_video_recognizer_release_result(results, result_count, (int)imgs.size()); + return output; + } - ~PyVideoRecognizer() { - mmdeploy_video_recognizer_destroy(recognizer_); - recognizer_ = {}; - } + ~PyVideoRecognizer() + { + mmdeploy_video_recognizer_destroy(recognizer_); + recognizer_ = {}; + } - private: - mmdeploy_video_recognizer_t recognizer_{}; -}; + private: + mmdeploy_video_recognizer_t recognizer_{}; + }; -static PythonBindingRegisterer register_video_recognizer{[](py::module& m) { - py::class_(m, "VideoRecognizer") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return 
std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyVideoRecognizer* self, const std::vector& imgs, - const std::pair& info) { return self->Apply({imgs}, {info})[0]; }) - .def("batch", &PyVideoRecognizer::Apply); -}}; + static PythonBindingRegisterer register_video_recognizer{[](py::module& m) + { + py::class_(m, "VideoRecognizer") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyVideoRecognizer* self, const std::vector& imgs, const std::pair& info) + { return self->Apply({imgs}, {info})[0]; }) + .def("batch", &PyVideoRecognizer::Apply); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/archive/json_archive.h b/csrc/mmdeploy/archive/json_archive.h index 2803ee22b2..cf03005856 100644 --- a/csrc/mmdeploy/archive/json_archive.h +++ b/csrc/mmdeploy/archive/json_archive.h @@ -7,207 +7,247 @@ #include "mmdeploy/core/archive.h" #include "mmdeploy/core/value.h" -namespace mmdeploy { - -namespace detail { - -template -nlohmann::json to_json_impl(T&& val); - -inline nlohmann::json value_to_json(const Value& value) { - switch (value.type()) { - case ValueType::kNull: - return {}; - case ValueType::kBool: - return value.get(); - case ValueType::kInt: - return value.get(); - case ValueType::kUInt: - return value.get(); - case ValueType::kFloat: - return value.get(); - case ValueType::kString: - return value.get(); - case ValueType::kArray: { - nlohmann::json json = nlohmann::json::value_t::array; - for (const auto& x : value) { - json.push_back(value_to_json(x)); - } - return json; +namespace mmdeploy +{ + + namespace detail + { + + template + nlohmann::json to_json_impl(T&& val); + + inline nlohmann::json value_to_json(const Value& value) + { + switch (value.type()) + { + case ValueType::kNull: + return {}; + case ValueType::kBool: + return value.get(); + case ValueType::kInt: + return value.get(); + case ValueType::kUInt: + return value.get(); + case ValueType::kFloat: + return value.get(); + case ValueType::kString: + return value.get(); + case ValueType::kArray: + { + nlohmann::json json = nlohmann::json::value_t::array; + for (const auto& x : value) + { + json.push_back(value_to_json(x)); + } + return json; + } + case ValueType::kObject: + { + nlohmann::json json = nlohmann::json::value_t::object; + for (auto it = value.begin(); it != value.end(); ++it) + { + auto key = it.key(); + json[key] = value_to_json(*it); + } + return json; + } + case ValueType::kAny: + return ""; + default: + return ""; + } + } + + } // namespace detail + + template>, int> = 0> + nlohmann::json to_json(T&& val) + { + return detail::to_json_impl(std::forward(val)); } - case ValueType::kObject: { - nlohmann::json json = nlohmann::json::value_t::object; - for (auto it = value.begin(); it != value.end(); ++it) { - auto key = it.key(); - json[key] = value_to_json(*it); - } - return json; + + inline nlohmann::json to_json(const Value& value) + { + return detail::value_to_json(value); + } + + // save to JSON + class JsonOutputArchive : public OutputArchive + { + public: + explicit JsonOutputArchive(nlohmann::json& data) + : data_(data) + { + } + + void init(...) 
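The video-recognizer binding above flattens all frames into a single `clips` vector with one `clip_info` entry per video, after checking that video `i` supplies exactly `info[i].first * info[i].second` frames. A small sketch of that invariant (the pair member names are hypothetical):

    #include <stdexcept>
    #include <utility>
    #include <vector>

    // Each video must supply exactly clip_len * num_clips frames, where
    // info[i] = {clip_len, num_clips}.
    template <typename Frame>
    void check_clip_info(const std::vector<std::vector<Frame>>& videos,
                         const std::vector<std::pair<int, int>>& info)
    {
        if (info.size() != videos.size())
        {
            throw std::invalid_argument("one {clip_len, num_clips} entry per video expected");
        }
        for (size_t i = 0; i < videos.size(); ++i)
        {
            if (videos[i].size() != static_cast<size_t>(info[i].first) * info[i].second)
            {
                throw std::invalid_argument("frame count does not match clip_len * num_clips");
            }
        }
    }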
{} + + template + void named_value(const std::string& name, T&& val) + { + data_[name] = to_json(std::forward(val)); + } + + template + void item(T&& val) + { + data_.push_back(to_json(std::forward(val))); + } + + template, std::enable_if_t, std::is_same, std::is_same, std::is_same>, int> = 0> + void native(T&& val) + { + data_ = std::forward(val); + } + + private: + nlohmann::json& data_; + }; + + namespace detail + { + + template + inline nlohmann::json to_json_impl(T&& val) + { + nlohmann::json json; + JsonOutputArchive archive(json); + archive(std::forward(val)); + return json; + } + + } // namespace detail + + namespace detail + { + + inline Value json_to_value(const nlohmann::json& json) + { + using value_t = nlohmann::json::value_t; + switch (json.type()) + { + case value_t::null: + return {}; + case value_t::boolean: + return json.get(); + case value_t::number_integer: + return json.get(); + case value_t::number_unsigned: + return json.get(); + case value_t::number_float: + return json.get(); + case value_t::string: + return json.get(); + case value_t::array: + { + Value value = ValueType::kArray; + for (const auto& x : json) + { + value.push_back(json_to_value(x)); + } + return value; + } + case value_t::object: + { + Value value = ValueType::kObject; + for (const auto& proxy : json.items()) + { + value[proxy.key()] = json_to_value(proxy.value()); + } + return value; + } + default: + MMDEPLOY_ERROR("unsupported json type: {}", json.type_name()); + return {}; + } + } + + template + void from_json_impl(const nlohmann::json& json, T&& val); + + } // namespace detail + + template>, int> = 0> + void from_json(const nlohmann::json& json, T&& val) + { + detail::from_json_impl(json, std::forward(val)); } - case ValueType::kAny: - return ""; - default: - return ""; - } -} - -} // namespace detail - -template >, int> = 0> -nlohmann::json to_json(T&& val) { - return detail::to_json_impl(std::forward(val)); -} - -inline nlohmann::json to_json(const Value& value) { return detail::value_to_json(value); } - -// save to JSON -class JsonOutputArchive : public OutputArchive { - public: - explicit JsonOutputArchive(nlohmann::json& data) : data_(data) {} - - void init(...) 
{} - - template - void named_value(const std::string& name, T&& val) { - data_[name] = to_json(std::forward(val)); - } - - template - void item(T&& val) { - data_.push_back(to_json(std::forward(val))); - } - - template , - std::enable_if_t< - std::disjunction_v, std::is_same, - std::is_same, std::is_same>, - int> = 0> - void native(T&& val) { - data_ = std::forward(val); - } - - private: - nlohmann::json& data_; -}; - -namespace detail { - -template -inline nlohmann::json to_json_impl(T&& val) { - nlohmann::json json; - JsonOutputArchive archive(json); - archive(std::forward(val)); - return json; -} - -} // namespace detail - -namespace detail { - -inline Value json_to_value(const nlohmann::json& json) { - using value_t = nlohmann::json::value_t; - switch (json.type()) { - case value_t::null: - return {}; - case value_t::boolean: - return json.get(); - case value_t::number_integer: - return json.get(); - case value_t::number_unsigned: - return json.get(); - case value_t::number_float: - return json.get(); - case value_t::string: - return json.get(); - case value_t::array: { - Value value = ValueType::kArray; - for (const auto& x : json) { - value.push_back(json_to_value(x)); - } - return value; + + inline void from_json(const nlohmann::json& json, Value& val) + { + val = detail::json_to_value(json); } - case value_t::object: { - Value value = ValueType::kObject; - for (const auto& proxy : json.items()) { - value[proxy.key()] = json_to_value(proxy.value()); - } - return value; + + template + T from_json(const nlohmann::json& json); + + // load from JSON + class JsonInputArchive : public InputArchive + { + public: + explicit JsonInputArchive(const nlohmann::json& data) + : data_(data) + { + } + + template + void init(SizeType& size) + { + size = static_cast(data_.size()); + iter_ = data_.begin(); + } + + template + void named_value(std::string& name, T& val) + { + name = iter_.key(); + from_json(*iter_++, std::forward(val)); + } + + template + void named_value(const std::string& name, T&& val) + { + from_json(data_[name], std::forward(val)); + } + + template + void item(T&& val) + { + from_json(*iter_++, std::forward(val)); + } + + template + void native(T&& val) + { + data_.get_to(val); + } + + private: + const nlohmann::json& data_; + nlohmann::json::const_iterator iter_; + }; + + namespace detail + { + + template + inline void from_json_impl(const nlohmann::json& json, T&& val) + { + JsonInputArchive archive(json); + archive(std::forward(val)); + } + + } // namespace detail + + template + inline T from_json(const nlohmann::json& json) + { + T val{}; + from_json(json, val); + return val; } - default: - MMDEPLOY_ERROR("unsupported json type: {}", json.type_name()); - return {}; - } -} - -template -void from_json_impl(const nlohmann::json& json, T&& val); - -} // namespace detail - -template >, int> = 0> -void from_json(const nlohmann::json& json, T&& val) { - detail::from_json_impl(json, std::forward(val)); -} - -inline void from_json(const nlohmann::json& json, Value& val) { val = detail::json_to_value(json); } - -template -T from_json(const nlohmann::json& json); - -// load from JSON -class JsonInputArchive : public InputArchive { - public: - explicit JsonInputArchive(const nlohmann::json& data) : data_(data) {} - - template - void init(SizeType& size) { - size = static_cast(data_.size()); - iter_ = data_.begin(); - } - - template - void named_value(std::string& name, T& val) { - name = iter_.key(); - from_json(*iter_++, std::forward(val)); - } - - template - void named_value(const 
std::string& name, T&& val) { - from_json(data_[name], std::forward(val)); - } - - template - void item(T&& val) { - from_json(*iter_++, std::forward(val)); - } - - template - void native(T&& val) { - data_.get_to(val); - } - - private: - const nlohmann::json& data_; - nlohmann::json::const_iterator iter_; -}; - -namespace detail { - -template -inline void from_json_impl(const nlohmann::json& json, T&& val) { - JsonInputArchive archive(json); - archive(std::forward(val)); -} - -} // namespace detail - -template -inline T from_json(const nlohmann::json& json) { - T val{}; - from_json(json, val); - return val; -} - -void from_json(const nlohmann::json& json, Value& val); + + void from_json(const nlohmann::json& json, Value& val); } // namespace mmdeploy diff --git a/csrc/mmdeploy/archive/value_archive.h b/csrc/mmdeploy/archive/value_archive.h index 2f559c1a10..f3245f0dfc 100644 --- a/csrc/mmdeploy/archive/value_archive.h +++ b/csrc/mmdeploy/archive/value_archive.h @@ -6,131 +6,169 @@ #include "mmdeploy/core/archive.h" #include "mmdeploy/core/value.h" -namespace mmdeploy { - -template -Value to_value(T&& val); - -// save to Value -class ValueOutputArchive : public OutputArchive { - public: - explicit ValueOutputArchive(Value& data) : data_(data) {} - - template - void init(array_tag) { - data_ = ValueType::kArray; - } - - template - void init(object_tag) { - data_ = ValueType::kObject; - } - - template - void named_value(const std::string& name, T&& val) { - data_[name] = to_value(std::forward(val)); - } - - template - void item(T&& val) { - data_.push_back(to_value(std::forward(val))); - } - - template , int> = 0> - void native(T&& val) { - data_ = std::forward(val); - }; - - private: - Value& data_; -}; - -template -inline Value to_value(T&& val) { - Value value; - ValueOutputArchive archive(value); - archive(std::forward(val)); - return value; -} - -// fast path -inline Value to_value(const Value& v) { return v; } -inline Value to_value(Value&& v) { return std::move(v); } - -template -void from_value(const Value& value, T&& x); - -template -T from_value(const Value& value); - -// load from Value -class ValueInputArchive : public InputArchive { - public: - explicit ValueInputArchive(const Value& data) : data_(data) {} - - template - void init(SizeType& size) { - size = static_cast(data_.size()); - iter_ = data_.begin(); - } - - template - void named_value(std::string& name, T& val) { - name = iter_.key(); - from_value(*iter_, std::forward(val)); - ++iter_; - } - - template - void named_value(const std::string& name, T&& val) { - from_value(data_[name], std::forward(val)); - } - - template - void item(T&& val) { - from_value(*iter_, std::forward(val)); - ++iter_; - } - - template - void native(T&& val) { - data_.get_to(val); - } - - template - void value(T&& value) {} - - private: - const Value& data_; - Value::const_iterator iter_; -}; - -template -void from_value(const Value& value, T&& x) { - ValueInputArchive archive(value); - archive(std::forward(x)); -} - -// Required to avoid Value::Pointer being unwrapped by Value::get_to() -inline void from_value(const Value& value, Value& x) { x = value; } - -template -inline T from_value(const Value& value) { - T x{}; - from_value(value, x); - return x; -} - -namespace detail { - -inline void load(ValueInputArchive& archive, Value& v) { archive.native(v); } - -template , Value>::value, bool> = true> -inline void save(ValueOutputArchive& archive, T&& v) { - archive.native(std::forward(v)); -} - -} // namespace detail +namespace mmdeploy +{ + + 
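Taken together, the declarations above give `json_archive.h` a symmetric pair of entry points between `nlohmann::json` and `mmdeploy::Value`. A usage sketch, assuming only what the header itself declares:

    #include "mmdeploy/archive/json_archive.h"

    // Round-trip an mmdeploy::Value through nlohmann::json with the to_json /
    // from_json overloads declared above; j2 reconstructs the same document.
    inline void round_trip_example()
    {
        nlohmann::json j = nlohmann::json::parse(R"({"thr": 0.3, "labels": [1, 2, 3]})");
        mmdeploy::Value v;
        mmdeploy::from_json(j, v);                 // json -> Value
        nlohmann::json j2 = mmdeploy::to_json(v);  // Value -> json
    }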
template + Value to_value(T&& val); + + // save to Value + class ValueOutputArchive : public OutputArchive + { + public: + explicit ValueOutputArchive(Value& data) + : data_(data) + { + } + + template + void init(array_tag) + { + data_ = ValueType::kArray; + } + + template + void init(object_tag) + { + data_ = ValueType::kObject; + } + + template + void named_value(const std::string& name, T&& val) + { + data_[name] = to_value(std::forward(val)); + } + + template + void item(T&& val) + { + data_.push_back(to_value(std::forward(val))); + } + + template, int> = 0> + void native(T&& val) + { + data_ = std::forward(val); + }; + + private: + Value& data_; + }; + + template + inline Value to_value(T&& val) + { + Value value; + ValueOutputArchive archive(value); + archive(std::forward(val)); + return value; + } + + // fast path + inline Value to_value(const Value& v) + { + return v; + } + inline Value to_value(Value&& v) + { + return std::move(v); + } + + template + void from_value(const Value& value, T&& x); + + template + T from_value(const Value& value); + + // load from Value + class ValueInputArchive : public InputArchive + { + public: + explicit ValueInputArchive(const Value& data) + : data_(data) + { + } + + template + void init(SizeType& size) + { + size = static_cast(data_.size()); + iter_ = data_.begin(); + } + + template + void named_value(std::string& name, T& val) + { + name = iter_.key(); + from_value(*iter_, std::forward(val)); + ++iter_; + } + + template + void named_value(const std::string& name, T&& val) + { + from_value(data_[name], std::forward(val)); + } + + template + void item(T&& val) + { + from_value(*iter_, std::forward(val)); + ++iter_; + } + + template + void native(T&& val) + { + data_.get_to(val); + } + + template + void value(T&& value) + { + } + + private: + const Value& data_; + Value::const_iterator iter_; + }; + + template + void from_value(const Value& value, T&& x) + { + ValueInputArchive archive(value); + archive(std::forward(x)); + } + + // Required to avoid Value::Pointer being unwrapped by Value::get_to() + inline void from_value(const Value& value, Value& x) + { + x = value; + } + + template + inline T from_value(const Value& value) + { + T x{}; + from_value(value, x); + return x; + } + + namespace detail + { + + inline void load(ValueInputArchive& archive, Value& v) + { + archive.native(v); + } + + template, Value>::value, bool> = true> + inline void save(ValueOutputArchive& archive, T&& v) + { + archive.native(std::forward(v)); + } + + } // namespace detail } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/common_cuda_helper.cuh b/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/common_cuda_helper.cuh index 02c57c62e6..d1b3195669 100644 --- a/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/common_cuda_helper.cuh +++ b/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/common_cuda_helper.cuh @@ -8,25 +8,27 @@ #include #define CUDA_1D_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) #define THREADS_PER_BLOCK 512 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) -inline int GET_BLOCKS(const int N) { - int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK); - int max_block_num = 4096; - return std::min(optimal_block_num, max_block_num); +inline int GET_BLOCKS(const int N) +{ + int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK); + int max_block_num = 
4096; + return std::min(optimal_block_num, max_block_num); } -#define cudaCheckError() \ - { \ - cudaError_t e = cudaGetLastError(); \ - if (e != cudaSuccess) { \ - printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(0); \ - } \ - } +#define cudaCheckError() \ + { \ + cudaError_t e = cudaGetLastError(); \ + if (e != cudaSuccess) \ + { \ + printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(0); \ + } \ + } /** * Returns a view of the original tensor with its dimensions permuted. @@ -38,57 +40,59 @@ inline int GET_BLOCKS(const int N) { * @param[in] src_dim dim of src tensor * @param[in] stream cuda stream handle */ -template -void memcpyPermute(scalar_t* dst, const scalar_t* src, int* src_size, int* permute, int src_dim, - cudaStream_t stream = 0); - -template -cublasStatus_t cublasGemmWrap(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, const scalar_t* alpha, - const scalar_t* A, int lda, const scalar_t* B, int ldb, - const scalar_t* beta, scalar_t* C, int ldc); - -template -__device__ scalar_t bilinear_interpolate(const scalar_t* input, const int height, const int width, - scalar_t y, scalar_t x) { - // deal with cases that inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) return 0; - - if (y <= 0) y = 0; - if (x <= 0) x = 0; - - int y_low = (int)y; - int x_low = (int)x; - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (scalar_t)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (scalar_t)x_low; - } else { - x_high = x_low + 1; - } - - scalar_t ly = y - y_low; - scalar_t lx = x - x_low; - scalar_t hy = 1. - ly, hx = 1. - lx; - // do bilinear interpolation - scalar_t v1 = input[y_low * width + x_low]; - scalar_t v2 = input[y_low * width + x_high]; - scalar_t v3 = input[y_high * width + x_low]; - scalar_t v4 = input[y_high * width + x_high]; - scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - - return val; +template +void memcpyPermute(scalar_t* dst, const scalar_t* src, int* src_size, int* permute, int src_dim, cudaStream_t stream = 0); + +template +cublasStatus_t cublasGemmWrap(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const scalar_t* alpha, const scalar_t* A, int lda, const scalar_t* B, int ldb, const scalar_t* beta, scalar_t* C, int ldc); + +template +__device__ scalar_t bilinear_interpolate(const scalar_t* input, const int height, const int width, scalar_t y, scalar_t x) +{ + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) return 0; + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) + { + y_high = y_low = height - 1; + y = (scalar_t)y_low; + } + else + { + y_high = y_low + 1; + } + + if (x_low >= width - 1) + { + x_high = x_low = width - 1; + x = (scalar_t)x_low; + } + else + { + x_high = x_low + 1; + } + + scalar_t ly = y - y_low; + scalar_t lx = x - x_low; + scalar_t hy = 1. - ly, hx = 1. 
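A usage sketch for the launch helpers above: a grid-stride elementwise kernel (a hypothetical ReLU) dispatched with `GET_BLOCKS` / `THREADS_PER_BLOCK` and verified with `cudaCheckError()`:

    #include <cuda_runtime.h>

    #include "common_cuda_helper.cuh"

    // Grid-stride loop: every thread strides over the range, so any n is
    // covered even when the grid is clamped to max_block_num blocks.
    __global__ void relu_kernel(const float* in, float* out, int n)
    {
        CUDA_1D_KERNEL_LOOP(i, n)
        {
            out[i] = in[i] > 0.f ? in[i] : 0.f;
        }
    }

    void relu(const float* in, float* out, int n, cudaStream_t stream)
    {
        relu_kernel<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(in, out, n);
        cudaCheckError();
    }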
- lx; + // do bilinear interpolation + scalar_t v1 = input[y_low * width + x_low]; + scalar_t v2 = input[y_low * width + x_high]; + scalar_t v3 = input[y_high * width + x_low]; + scalar_t v4 = input[y_high * width + x_high]; + scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; } #endif // COMMON_CUDA_HELPER diff --git a/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cpu.h b/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cpu.h index a37e243109..4bd17cd0d3 100644 --- a/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cpu.h +++ b/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cpu.h @@ -1,82 +1,83 @@ #include #include -template -T bilinear_interpolate_2d(const T *src, const int64_t src_h, const int64_t src_w, const T h, - const T w) { - if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) { - return 0; - } +template +T bilinear_interpolate_2d(const T* src, const int64_t src_h, const int64_t src_w, const T h, const T w) +{ + if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) + { + return 0; + } - int64_t h_low = floor(h); - int64_t w_low = floor(w); - int64_t h_high = h_low + 1; - int64_t w_high = w_low + 1; + int64_t h_low = floor(h); + int64_t w_low = floor(w); + int64_t h_high = h_low + 1; + int64_t w_high = w_low + 1; - T lh = h - h_low; - T lw = w - w_low; - T hh = 1 - lh; - T hw = 1 - lw; + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh; + T hw = 1 - lw; - T v1 = 0; - if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low]; - T v2 = 0; - if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high]; - T v3 = 0; - if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low]; - T v4 = 0; - if (h_high <= src_h - 1 && w_high <= src_w - 1) v4 = src[h_high * src_w + w_high]; + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high]; + T v3 = 0; + if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low]; + T v4 = 0; + if (h_high <= src_h - 1 && w_high <= src_w - 1) v4 = src[h_high * src_w + w_high]; - T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; } // output: (channels * kernel_h * kernel_w, dst_h * dst_w) -template -void deformable_im2col_2d(const T *input, const T *offset, const T *mask, const int64_t src_h, - const int64_t src_w, const int64_t kernel_h, const int64_t kernel_w, - const int64_t pad_h, const int64_t pad_w, const int64_t stride_h, - const int64_t stride_w, const int64_t dilation_h, - const int64_t dilation_w, const int64_t channels, - const int64_t offset_groups, const int64_t dst_h, const int64_t dst_w, - const bool use_mask, T *columns) { - const int64_t workload = channels * dst_h * dst_w; - for (int64_t index = 0; index != workload; ++index) { - const int64_t ow = index % dst_w; - const int64_t oh = (index / dst_w) % dst_h; - const int64_t ic = index / (dst_w * dst_h); - const int64_t oc = ic * kernel_h * kernel_w; +template +void deformable_im2col_2d(const T* input, const T* offset, const T* mask, const int64_t src_h, const int64_t src_w, const int64_t kernel_h, const int64_t kernel_w, const int64_t pad_h, const int64_t 
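A quick numeric check of the weights used in `bilinear_interpolate` above: for fractional offsets `(ly, lx)` inside a 2x2 patch, the four weights `hy*hx`, `hy*lx`, `ly*hx`, `ly*lx` always sum to one, so sampling the center of a patch must give the mean of its corners:

    #include <cstdio>

    int main()
    {
        float v1 = 1.f, v2 = 2.f, v3 = 3.f, v4 = 4.f;  // corners (0,0) (0,1) (1,0) (1,1)
        float ly = 0.5f, lx = 0.5f;                    // sample the patch center
        float hy = 1.f - ly, hx = 1.f - lx;
        float val = hy * hx * v1 + hy * lx * v2 + ly * hx * v3 + ly * lx * v4;
        std::printf("%f\n", val);  // prints 2.500000, the mean of the corners
        return 0;
    }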
pad_w, const int64_t stride_h, const int64_t stride_w, const int64_t dilation_h, const int64_t dilation_w, const int64_t channels, const int64_t offset_groups, const int64_t dst_h, const int64_t dst_w, const bool use_mask, T* columns) +{ + const int64_t workload = channels * dst_h * dst_w; + for (int64_t index = 0; index != workload; ++index) + { + const int64_t ow = index % dst_w; + const int64_t oh = (index / dst_w) % dst_h; + const int64_t ic = index / (dst_w * dst_h); + const int64_t oc = ic * kernel_h * kernel_w; - int64_t c_per_offset_grp = channels / offset_groups; - const int64_t grp_idx = ic / c_per_offset_grp; + int64_t c_per_offset_grp = channels / offset_groups; + const int64_t grp_idx = ic / c_per_offset_grp; - auto columns_ptr = columns + (oc * (dst_h * dst_w) + oh * dst_w + ow); - auto input_ptr = input + ic * (src_h * src_w); - auto offset_ptr = offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w; - auto mask_ptr = mask; - if (use_mask) { - mask_ptr += grp_idx * kernel_h * kernel_w * dst_h * dst_w; - } + auto columns_ptr = columns + (oc * (dst_h * dst_w) + oh * dst_w + ow); + auto input_ptr = input + ic * (src_h * src_w); + auto offset_ptr = offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w; + auto mask_ptr = mask; + if (use_mask) + { + mask_ptr += grp_idx * kernel_h * kernel_w * dst_h * dst_w; + } - for (int64_t kh = 0; kh < kernel_h; ++kh) { - for (int64_t kw = 0; kw < kernel_w; ++kw) { - const int64_t mask_idx = kh * kernel_w + kw; - const int64_t offset_idx = 2 * mask_idx; + for (int64_t kh = 0; kh < kernel_h; ++kh) + { + for (int64_t kw = 0; kw < kernel_w; ++kw) + { + const int64_t mask_idx = kh * kernel_w + kw; + const int64_t offset_idx = 2 * mask_idx; - T mask_value = 1; - if (use_mask) { - mask_value = mask_ptr[mask_idx * (dst_h * dst_w) + oh * dst_w + ow]; - } + T mask_value = 1; + if (use_mask) + { + mask_value = mask_ptr[mask_idx * (dst_h * dst_w) + oh * dst_w + ow]; + } - const T offset_h = offset_ptr[offset_idx * (dst_h * dst_w) + oh * dst_w + ow]; - const T offset_w = offset_ptr[(offset_idx + 1) * (dst_h * dst_w) + oh * dst_w + ow]; - const T ih = (oh * stride_h - pad_h) + kh * dilation_h + offset_h; - const T iw = (ow * stride_w - pad_w) + kw * dilation_w + offset_w; - *columns_ptr = mask_value * bilinear_interpolate_2d(input_ptr, src_h, src_w, ih, iw); - columns_ptr += dst_h * dst_w; - } + const T offset_h = offset_ptr[offset_idx * (dst_h * dst_w) + oh * dst_w + ow]; + const T offset_w = offset_ptr[(offset_idx + 1) * (dst_h * dst_w) + oh * dst_w + ow]; + const T ih = (oh * stride_h - pad_h) + kh * dilation_h + offset_h; + const T iw = (ow * stride_w - pad_w) + kw * dilation_w + offset_w; + *columns_ptr = mask_value * bilinear_interpolate_2d(input_ptr, src_h, src_w, ih, iw); + columns_ptr += dst_h * dst_w; + } + } } - } } diff --git a/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cuda.cuh b/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cuda.cuh index 43166e7d6b..6051c4762b 100644 --- a/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cuda.cuh +++ b/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cuda.cuh @@ -71,110 +71,130 @@ #include "common_cuda_helper.cuh" -template -__device__ float mdcn_im2col_bilinear(const T *input, const int data_width, const int height, - const int width, float h, float w) { - int h_low = floorf(h); - int w_low = floorf(w); - int h_high = h_low + 1; - int w_high = w_low + 1; - - T lh = h - h_low; - T 
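A hypothetical driver for `deformable_im2col_2d` above, showing how the column buffer is sized as `(channels * kernel_h * kernel_w) x (dst_h * dst_w)` before the convolution is finished by an ordinary GEMM against the `(out_channels, channels * kernel_h * kernel_w)` weight matrix; the kernel size, padding, stride, and dilation here are assumed values:

    #include <cstdint>
    #include <vector>

    #include "modulated_deform_conv_cpu.h"

    void deform_conv_sketch(const float* input, const float* offset, const float* mask,
                            int64_t src_h, int64_t src_w, int64_t channels,
                            int64_t dst_h, int64_t dst_w)
    {
        const int64_t kh = 3, kw = 3;  // assumed 3x3 kernel, pad 1, stride 1, dilation 1
        std::vector<float> columns(channels * kh * kw * dst_h * dst_w);
        deformable_im2col_2d<float>(input, offset, mask, src_h, src_w, kh, kw,
                                    /*pad*/ 1, 1, /*stride*/ 1, 1, /*dilation*/ 1, 1,
                                    channels, /*offset_groups*/ 1, dst_h, dst_w,
                                    /*use_mask*/ true, columns.data());
        // columns is now ready for: output = weight * columns (GEMM omitted)
    }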
lw = w - w_low; - T hh = 1 - lh, hw = 1 - lw; - - T v1 = 0; - if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; - T v2 = 0; - if (h_low >= 0 && w_high <= width - 1) v2 = input[h_low * data_width + w_high]; - T v3 = 0; - if (h_high <= height - 1 && w_low >= 0) v3 = input[h_high * data_width + w_low]; - T v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) v4 = input[h_high * data_width + w_high]; - - T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return float(val); +template +__device__ float mdcn_im2col_bilinear(const T* input, const int data_width, const int height, const int width, float h, float w) +{ + int h_low = floorf(h); + int w_low = floorf(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh, hw = 1 - lw; + + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= width - 1) v2 = input[h_low * data_width + w_high]; + T v3 = 0; + if (h_high <= height - 1 && w_low >= 0) v3 = input[h_high * data_width + w_low]; + T v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) v4 = input[h_high * data_width + w_high]; + + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return float(val); } -template <> -__device__ float mdcn_im2col_bilinear<__half>(const __half *input, const int data_width, - const int height, const int width, float h, float w) { - int h_low = floorf(h); - int w_low = floorf(w); - int h_high = h_low + 1; - int w_high = w_low + 1; - - float lh = h - h_low; - float lw = w - w_low; - float hh = 1 - lh, hw = 1 - lw; - - float v1 = 0; - if (h_low >= 0 && w_low >= 0) v1 = __half2float(input[h_low * data_width + w_low]); - float v2 = 0; - if (h_low >= 0 && w_high <= width - 1) v2 = __half2float(input[h_low * data_width + w_high]); - float v3 = 0; - if (h_high <= height - 1 && w_low >= 0) v3 = __half2float(input[h_high * data_width + w_low]); - float v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) - v4 = __half2float(input[h_high * data_width + w_high]); - - float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - - float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; +template<> +__device__ float mdcn_im2col_bilinear<__half>(const __half* input, const int data_width, const int height, const int width, float h, float w) +{ + int h_low = floorf(h); + int w_low = floorf(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = __half2float(input[h_low * data_width + w_low]); + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) v2 = __half2float(input[h_low * data_width + w_high]); + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) v3 = __half2float(input[h_high * data_width + w_low]); + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = __half2float(input[h_high * data_width + w_high]); + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; } -template +template __global__ void modulated_deformable_im2col_gpu_kernel( - const int n, const T *data_im, const T *data_offset, const T *data_mask, const int height, - const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, - const int stride_h, 
const int stride_w, const int dilation_h, const int dilation_w, - const int channel_per_deformable_group, const int batch_size, const int num_channels, - const int deformable_group, const int height_col, const int width_col, T *data_col) { - CUDA_1D_KERNEL_LOOP(index, n) { - // index index of output matrix - const int w_col = index % width_col; - const int h_col = (index / width_col) % height_col; - const int b_col = (index / width_col / height_col) % batch_size; - const int c_im = (index / width_col / height_col) / batch_size; - const int c_col = c_im * kernel_h * kernel_w; - - // compute deformable group index - const int deformable_group_index = c_im / channel_per_deformable_group; - - const int h_in = h_col * stride_h - pad_h; - const int w_in = w_col * stride_w - pad_w; - - T *data_col_ptr = - data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; - const T *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; - const T *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * - 2 * kernel_h * kernel_w * height_col * width_col; - - const T *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * - kernel_h * kernel_w * height_col * width_col; - - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - const int data_offset_h_ptr = - ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; - const int data_offset_w_ptr = - ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; - const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; - const T offset_h = data_offset_ptr[data_offset_h_ptr]; - const T offset_w = data_offset_ptr[data_offset_w_ptr]; - const T mask = data_mask_ptr[data_mask_hw_ptr]; - float val = 0.0f; - const float h_im = h_in + i * dilation_h + (float)offset_h; - const float w_im = w_in + j * dilation_w + (float)offset_w; - if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) - val = mdcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); - *data_col_ptr = (T)(val * (float)mask); - data_col_ptr += batch_size * height_col * width_col; - } + const int n, + const T* data_im, + const T* data_offset, + const T* data_mask, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + T* data_col) +{ + CUDA_1D_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const T* data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * 
height_col * width_col;
+
+        const T* data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) *
+                                                 kernel_h * kernel_w * height_col * width_col;
+
+        for (int i = 0; i < kernel_h; ++i)
+        {
+            for (int j = 0; j < kernel_w; ++j)
+            {
+                const int data_offset_h_ptr =
+                    ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+                const int data_offset_w_ptr =
+                    ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
+                const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
+                const T offset_h = data_offset_ptr[data_offset_h_ptr];
+                const T offset_w = data_offset_ptr[data_offset_w_ptr];
+                const T mask = data_mask_ptr[data_mask_hw_ptr];
+                float val = 0.0f;
+                const float h_im = h_in + i * dilation_h + (float)offset_h;
+                const float w_im = w_in + j * dilation_w + (float)offset_w;
+                if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+                    val = mdcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
+                *data_col_ptr = (T)(val * (float)mask);
+                data_col_ptr += batch_size * height_col * width_col;
+            }
+        }
     }
-  }
 }

 #endif  // TRT_MODULATED_DEFORM_CONV_KERNEL_CUH
diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.cpp b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.cpp
index 4d620e4c82..274ba76bca 100644
--- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.cpp
+++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.cpp
@@ -1,355 +1,402 @@
 // Copyright (c) OpenMMLab. All rights reserved.
 #include "fuse_pass.h"

-void fuse_identity(onnx::GraphProto* mutable_graph,
+void fuse_identity(onnx::GraphProto*                         mutable_graph,
                    std::map<std::string, onnx::TensorProto>& weights,
-                   std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                   int& reduced_node_count) {
-  // fuse
-  // identity --> op
-  // to
-  // noop_reducencnn --> op
-  const int node_count = mutable_graph->node_size();
-  for (int i = 0; i < node_count; ++i) {
-    onnx::NodeProto* node = mutable_graph->mutable_node(i);
-    for (int j = 0; j < node->input_size(); ++j) {
-      std::string output_name = node->input(j);
-      onnx::NodeProto* last_node = find_node_by_output_name(mutable_graph, output_name);
-      if (last_node && last_node->op_type() == "Identity") {
-        node->set_input(j, last_node->input(0));
-        node_reference[last_node->output(0)] -= 1;
-        node_reference[last_node->input(0)] += 1;
-        if (node_reference[last_node->output(0)] == 0) {
-          last_node->set_op_type("noop_reducedncnn");
-          node_reference[last_node->input(0)] -= 1;
-          reduced_node_count += 1;
+                   std::map<std::string, int>&               node_reference,
+                   std::set<std::string>&                    blob_names,
+                   int&                                      reduced_node_count)
+{
+    // fuse
+    // identity --> op
+    // to
+    // noop_reducedncnn --> op
+    const int node_count = mutable_graph->node_size();
+    for (int i = 0; i < node_count; ++i)
+    {
+        onnx::NodeProto* node = mutable_graph->mutable_node(i);
+        for (int j = 0; j < node->input_size(); ++j)
+        {
+            std::string output_name = node->input(j);
+            onnx::NodeProto* last_node = find_node_by_output_name(mutable_graph, output_name);
+            if (last_node && last_node->op_type() == "Identity")
+            {
+                node->set_input(j, last_node->input(0));
+                node_reference[last_node->output(0)] -= 1;
+                node_reference[last_node->input(0)] += 1;
+                if (node_reference[last_node->output(0)] == 0)
+                {
+                    last_node->set_op_type("noop_reducedncnn");
+                    node_reference[last_node->input(0)] -= 1;
+                    reduced_node_count += 1;
+                }
+            }
         }
-      }
     }
-  }
 }

-void fuse_rewrite_gather(onnx::GraphProto* mutable_graph,
+void fuse_rewrite_gather(onnx::GraphProto*                         mutable_graph,
                          std::map<std::string, onnx::TensorProto>& weights,
-                         std::map<std::string, int>& node_reference,
-                         std::set<std::string>& blob_names,
int& reduced_node_count) { - const int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; ++i) { - onnx::NodeProto* gather = mutable_graph->mutable_node(i); - if (gather->op_type() != "Gather") { - continue; - } - if (weights.find(std::string(gather->input(1))) == weights.end()) { - continue; - } - auto indices = get_node_attr_from_input_ai(weights[gather->input(1)]); - if (indices.size() != 1) { - continue; - } - + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + const int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; ++i) { - // reconstruct node connections - node_reference[gather->input(1)] -= 1; - std::string origin_inp = gather->input(0); - gather->clear_input(); - gather->add_input(origin_inp); - } + onnx::NodeProto* gather = mutable_graph->mutable_node(i); + if (gather->op_type() != "Gather") + { + continue; + } + if (weights.find(std::string(gather->input(1))) == weights.end()) + { + continue; + } + auto indices = get_node_attr_from_input_ai(weights[gather->input(1)]); + if (indices.size() != 1) + { + continue; + } - { - // update axis, starts and ends - int axis = get_node_attr_i(*gather, "axis", 1) - 1; + { + // reconstruct node connections + node_reference[gather->input(1)] -= 1; + std::string origin_inp = gather->input(0); + gather->clear_input(); + gather->add_input(origin_inp); + } + + { + // update axis, starts and ends + int axis = get_node_attr_i(*gather, "axis", 1) - 1; - gather->set_op_type("Crop"); - gather->clear_attribute(); + gather->set_op_type("Crop"); + gather->clear_attribute(); - int indice = indices[0]; - set_node_attr_ai(*gather, "starts", std::vector{indice}); - set_node_attr_ai(*gather, "ends", std::vector{indice + 1}); - set_node_attr_ai(*gather, "axis", std::vector{axis}); + int indice = indices[0]; + set_node_attr_ai(*gather, "starts", std::vector{indice}); + set_node_attr_ai(*gather, "ends", std::vector{indice + 1}); + set_node_attr_ai(*gather, "axis", std::vector{axis}); + } } - } } -void fuse_weight_reshape(onnx::GraphProto* mutable_graph, +void fuse_weight_reshape(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // weight <= Reshape(weight) - if (node->op_type() == "Reshape") { - // check weight - if (weights.find(node->input(0)) == weights.end()) continue; - - weights[node->output(0)] = weights[node->input(0)]; - - // set weight shape directly - std::vector shape; - if (node->input_size() == 1) { - shape = get_node_attr_ai(*node, "shape"); - } else if (node->input_size() == 2) { - // opset 5 - shape = get_node_attr_from_input_ai(weights[node->input(1)]); - } - - weights[node->output(0)].clear_dims(); - for (int j = 0; j < shape.size(); j++) { - weights[node->output(0)].add_dims(shape[j]); - } - - // reduce - node->set_op_type("noop_reducedncnn"); - - node_reference[node->input(0)] -= 1; - if (node->input_size() == 2) { - node_reference[node->input(1)] -= 1; - } - - reduced_node_count += 1; - i += 1; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // weight <= Reshape(weight) + if (node->op_type() == "Reshape") + { + // check weight + if 
(weights.find(node->input(0)) == weights.end()) continue; + + weights[node->output(0)] = weights[node->input(0)]; + + // set weight shape directly + std::vector shape; + if (node->input_size() == 1) + { + shape = get_node_attr_ai(*node, "shape"); + } + else if (node->input_size() == 2) + { + // opset 5 + shape = get_node_attr_from_input_ai(weights[node->input(1)]); + } + + weights[node->output(0)].clear_dims(); + for (int j = 0; j < shape.size(); j++) + { + weights[node->output(0)].add_dims(shape[j]); + } + + // reduce + node->set_op_type("noop_reducedncnn"); + + node_reference[node->input(0)] -= 1; + if (node->input_size() == 2) + { + node_reference[node->input(1)] -= 1; + } + + reduced_node_count += 1; + i += 1; + } } - } } -void fuse_weight_transpose(onnx::GraphProto* mutable_graph, +void fuse_weight_transpose(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // weight <= Transpose(weight) - if (node->op_type() == "Transpose") { - // check weight - if (weights.find(node->input(0)) == weights.end()) continue; - - if (weights[node->input(0)].dims_size() != 2) continue; - - // perm = (1, 0) - std::vector perm = get_node_attr_ai(*node, "perm"); - if (perm.size() != 2) continue; - if (perm[0] != 1 || perm[1] != 0) continue; - - weights[node->output(0)] = weights[node->input(0)]; - - // permute weight - { - onnx::TensorProto& B = weights[node->output(0)]; - - const int h = B.dims(0); - const int w = B.dims(1); - - std::vector permuted_data; - permuted_data.reserve((size_t)h * w); - const float* bptr = - B.has_raw_data() ? (const float*)B.raw_data().data() : B.float_data().data(); - - for (int j = 0; j < w; j++) { - for (int k = 0; k < h; k++) { - float vb = bptr[k * w + j]; - permuted_data.push_back(vb); - } - } - - B.set_dims(0, w); - B.set_dims(1, h); - - if (B.has_raw_data()) { - B.set_raw_data(permuted_data.data(), permuted_data.size() * sizeof(float)); - } else { - for (int j = 0; j < (int)permuted_data.size(); j++) B.set_float_data(j, permuted_data[j]); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // weight <= Transpose(weight) + if (node->op_type() == "Transpose") + { + // check weight + if (weights.find(node->input(0)) == weights.end()) continue; + + if (weights[node->input(0)].dims_size() != 2) continue; + + // perm = (1, 0) + std::vector perm = get_node_attr_ai(*node, "perm"); + if (perm.size() != 2) continue; + if (perm[0] != 1 || perm[1] != 0) continue; + + weights[node->output(0)] = weights[node->input(0)]; + + // permute weight + { + onnx::TensorProto& B = weights[node->output(0)]; + + const int h = B.dims(0); + const int w = B.dims(1); + + std::vector permuted_data; + permuted_data.reserve((size_t)h * w); + const float* bptr = + B.has_raw_data() ? 
(const float*)B.raw_data().data() : B.float_data().data(); + + for (int j = 0; j < w; j++) + { + for (int k = 0; k < h; k++) + { + float vb = bptr[k * w + j]; + permuted_data.push_back(vb); + } + } + + B.set_dims(0, w); + B.set_dims(1, h); + + if (B.has_raw_data()) + { + B.set_raw_data(permuted_data.data(), permuted_data.size() * sizeof(float)); + } + else + { + for (int j = 0; j < (int)permuted_data.size(); j++) B.set_float_data(j, permuted_data[j]); + } + } + + // reduce + node->set_op_type("noop_reducedncnn"); + + node_reference[node->input(0)] -= 1; + + reduced_node_count += 1; + i += 1; } - } - - // reduce - node->set_op_type("noop_reducedncnn"); - - node_reference[node->input(0)] -= 1; - - reduced_node_count += 1; - i += 1; } - } } -void fuse_shufflechannel(onnx::GraphProto* mutable_graph, +void fuse_shufflechannel(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // ShuffleChannel <= Reshape - Transpose - Reshape - // ShuffleChannel <= Reshape - Transpose - Constant - Reshape - if (node->op_type() == "Reshape") { - if (node_reference[node->output(0)] != 1) continue; - - std::vector shape; - if (node->input_size() == 1) { - shape = get_node_attr_ai(*node, "shape"); - } else { - // skip weight reshape - if (weights.find(node->input(1)) == weights.end()) continue; - - shape = get_node_attr_from_input_ai(weights[node->input(1)]); - } - - // 1 groups channels_per_group, height, width - // reverse style = channels_per_group, groups, height * width - if (shape.size() != 5 && shape.size() != 3) continue; - - if (shape.size() == 5 && shape[0] != 1) continue; - - if (i + 2 >= node_count) continue; - - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - - if (node3->op_type() == "Constant") { - if (i + 3 >= node_count) continue; - - node3 = mutable_graph->mutable_node(i + 3); - } - - if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue; - - if (node_reference[node2->output(0)] != 1) continue; - - // 0 2 1 3 4 - // reverse style = 1 0 2 - std::vector perm = get_node_attr_ai(*node2, "perm"); - if (perm.size() != 5 && perm.size() != 3) continue; - - if (perm.size() == 5 && - (perm[0] != 0 || perm[1] != 2 || perm[2] != 1 || perm[3] != 3 || perm[4] != 4)) - continue; - - if (perm.size() == 3 && (perm[0] != 1 || perm[1] != 0 || perm[2] != 2)) continue; - - std::vector shape3; - if (node3->input_size() == 1) { - shape3 = get_node_attr_ai(*node3, "shape"); - } else { - // skip weight reshape - if (weights.find(node3->input(1)) == weights.end()) continue; - - shape3 = get_node_attr_from_input_ai(weights[node3->input(1)]); - } - - // 1, -1, height, width - // reverse style = group, -1, channels_per_group, height, width - if (shape3.size() != 4 && shape3.size() != 5) continue; - - if (shape3.size() == 4 && - (shape3[0] != 1 || (shape3[1] != -1 && shape3[1] != shape[1] * shape[2]))) - continue; - - if (shape3.size() == 5 && - (shape3[0] != shape[1] || shape3[2] != shape[0] || shape3[3] * shape3[4] != shape[2])) - continue; - - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); - - if (node->input_size() == 2) { - node_reference[node->input(1)] -= 1; - } - node_reference[node->output(0)] -= 1; - node_reference[node2->output(0)] -= 1; - if 
(node3->input_size() == 2) { - node_reference[node3->input(1)] -= 1; - } - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - - node3->set_op_type("ShuffleChannel"); - node3->set_input(0, node->input(0)); - - onnx::AttributeProto* attr_group = node3->add_attribute(); - attr_group->set_name("group"); - attr_group->set_i(shape[1]); - - onnx::AttributeProto* attr_reverse = node3->add_attribute(); - attr_reverse->set_name("reverse"); - attr_reverse->set_i(shape.size() == 3); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // ShuffleChannel <= Reshape - Transpose - Reshape + // ShuffleChannel <= Reshape - Transpose - Constant - Reshape + if (node->op_type() == "Reshape") + { + if (node_reference[node->output(0)] != 1) continue; + + std::vector shape; + if (node->input_size() == 1) + { + shape = get_node_attr_ai(*node, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node->input(1)) == weights.end()) continue; + + shape = get_node_attr_from_input_ai(weights[node->input(1)]); + } + + // 1 groups channels_per_group, height, width + // reverse style = channels_per_group, groups, height * width + if (shape.size() != 5 && shape.size() != 3) continue; + + if (shape.size() == 5 && shape[0] != 1) continue; + + if (i + 2 >= node_count) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + + if (node3->op_type() == "Constant") + { + if (i + 3 >= node_count) continue; + + node3 = mutable_graph->mutable_node(i + 3); + } + + if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue; + + if (node_reference[node2->output(0)] != 1) continue; + + // 0 2 1 3 4 + // reverse style = 1 0 2 + std::vector perm = get_node_attr_ai(*node2, "perm"); + if (perm.size() != 5 && perm.size() != 3) continue; + + if (perm.size() == 5 && + (perm[0] != 0 || perm[1] != 2 || perm[2] != 1 || perm[3] != 3 || perm[4] != 4)) + continue; + + if (perm.size() == 3 && (perm[0] != 1 || perm[1] != 0 || perm[2] != 2)) continue; + + std::vector shape3; + if (node3->input_size() == 1) + { + shape3 = get_node_attr_ai(*node3, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node3->input(1)) == weights.end()) continue; + + shape3 = get_node_attr_from_input_ai(weights[node3->input(1)]); + } + + // 1, -1, height, width + // reverse style = group, -1, channels_per_group, height, width + if (shape3.size() != 4 && shape3.size() != 5) continue; + + if (shape3.size() == 4 && + (shape3[0] != 1 || (shape3[1] != -1 && shape3[1] != shape[1] * shape[2]))) + continue; + + if (shape3.size() == 5 && + (shape3[0] != shape[1] || shape3[2] != shape[0] || shape3[3] * shape3[4] != shape[2])) + continue; + + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + + if (node->input_size() == 2) + { + node_reference[node->input(1)] -= 1; + } + node_reference[node->output(0)] -= 1; + node_reference[node2->output(0)] -= 1; + if (node3->input_size() == 2) + { + node_reference[node3->input(1)] -= 1; + } + + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + + node3->set_op_type("ShuffleChannel"); + node3->set_input(0, node->input(0)); + + onnx::AttributeProto* attr_group = node3->add_attribute(); + attr_group->set_name("group"); + attr_group->set_i(shape[1]); + 
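+            // Editor's note (illustrative comment, not part of the original change):
+            // with the 5-term pattern Reshape([1, g, c/g, h, w]) - Transpose(0,2,1,3,4)
+            // - Reshape, shape[1] is the group count and the fused layer runs forward;
+            // with the 3-term pattern Reshape([c/g, g, h*w]) - Transpose(1,0,2) - Reshape,
+            // shape[1] is again the group count but the channel shuffle must run in
+            // reverse mode, which is why reverse is derived from shape.size() == 3 below.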
+ onnx::AttributeProto* attr_reverse = node3->add_attribute(); + attr_reverse->set_name("reverse"); + attr_reverse->set_i(shape.size() == 3); - reduced_node_count += 2; - i += 2; + reduced_node_count += 2; + i += 2; + } } - } } -void fuse_shufflechannel_split(onnx::GraphProto* mutable_graph, +void fuse_shufflechannel_split(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // Split <= ShuffleChannel(reverse type) - Gather(0) - Gather(1) - if (node->op_type() == "ShuffleChannel") { - // reverse = 1 - int reverse = get_node_attr_i(*node, "reverse"); - if (reverse != 1) continue; + // Split <= ShuffleChannel(reverse type) - Gather(0) - Gather(1) + if (node->op_type() == "ShuffleChannel") + { + // reverse = 1 + int reverse = get_node_attr_i(*node, "reverse"); + if (reverse != 1) continue; - if (i + 2 >= node_count) continue; + if (i + 2 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - if (node2->op_type() != "Gather" || node3->op_type() != "Gather") continue; + if (node2->op_type() != "Gather" || node3->op_type() != "Gather") continue; - if (node2->input(0) != node->output(0) || node3->input(0) != node->output(0)) continue; + if (node2->input(0) != node->output(0) || node3->input(0) != node->output(0)) continue; - // axis = 0 - int gather2_axis = get_node_attr_i(*node2, "axis"); - if (gather2_axis != 0) continue; + // axis = 0 + int gather2_axis = get_node_attr_i(*node2, "axis"); + if (gather2_axis != 0) continue; - // indices = 0 - if (weights.find(node2->input(1)) == weights.end()) continue; + // indices = 0 + if (weights.find(node2->input(1)) == weights.end()) continue; - std::vector gather2_indices = get_node_attr_from_input_ai(weights[node2->input(1)]); - if (gather2_indices.size() != 1 || gather2_indices[0] != 0) continue; + std::vector gather2_indices = get_node_attr_from_input_ai(weights[node2->input(1)]); + if (gather2_indices.size() != 1 || gather2_indices[0] != 0) continue; - // axis = 0 - int gather3_axis = get_node_attr_i(*node3, "axis"); - if (gather3_axis != 0) continue; + // axis = 0 + int gather3_axis = get_node_attr_i(*node3, "axis"); + if (gather3_axis != 0) continue; - // indices = 1 - if (weights.find(node3->input(1)) == weights.end()) continue; + // indices = 1 + if (weights.find(node3->input(1)) == weights.end()) continue; - std::vector gather3_indices = get_node_attr_from_input_ai(weights[node3->input(1)]); - if (gather3_indices.size() != 1 || gather3_indices[0] != 1) continue; + std::vector gather3_indices = get_node_attr_from_input_ai(weights[node3->input(1)]); + if (gather3_indices.size() != 1 || gather3_indices[0] != 1) continue; - // reduce - node2->set_op_type("noop_reducedncnn"); + // reduce + node2->set_op_type("noop_reducedncnn"); - node_reference[node->output(0)] -= 2; - node_reference[node2->input(1)] -= 1; - node_reference[node3->input(1)] -= 1; + node_reference[node->output(0)] -= 
2;
+            node_reference[node2->input(1)] -= 1;
+            node_reference[node3->input(1)] -= 1;

-    node3->set_op_type("Split");
-    node3->clear_input();
-    node3->add_input(node->output(0));
-    node3->add_output(node3->output(0));
-    node3->set_output(0, node2->output(0));
+            node3->set_op_type("Split");
+            node3->clear_input();
+            node3->add_input(node->output(0));
+            node3->add_output(node3->output(0));
+            node3->set_output(0, node2->output(0));

-    node3->clear_attribute();
-    onnx::AttributeProto* attr_axis = node3->add_attribute();
-    attr_axis->set_name("axis");
-    attr_axis->set_i(1);
+            node3->clear_attribute();
+            onnx::AttributeProto* attr_axis = node3->add_attribute();
+            attr_axis->set_name("axis");
+            attr_axis->set_i(1);

-    reduced_node_count += 1;
-    i += 1;
+            reduced_node_count += 1;
+            i += 1;
+        }
     }
-  }
 }

 /**
@@ -369,2034 +416,2209 @@ void fuse_shufflechannel_split(onnx::GraphProto* mutable_graph,
  * @param blob_names
  * @param reduced_node_count
  */
-void fuse_conv_reshape(onnx::GraphProto* mutable_graph,
+void fuse_conv_reshape(onnx::GraphProto*                         mutable_graph,
                        std::map<std::string, onnx::TensorProto>& weights,
-                       std::map<std::string, int>& node_reference,
-                       std::set<std::string>& blob_names, int& reduced_node_count) {
-  std::map<std::string, std::vector<int>> shape_context;
-  const int node_count = mutable_graph->node_size();
-
-  for (int i = 0; i < node_count; i++) {
-    onnx::NodeProto* conv = mutable_graph->mutable_node(i);
-
-    if (conv->op_type() != "Conv") {
-      continue;
-    }
-
-    if (i + 4 >= node_count) {
-      continue;
-    }
-
-    onnx::NodeProto *shape = nullptr, *slice = nullptr, *concat = nullptr, *reshape = nullptr;
-
-    // match [Shape ... Slice, Concat ... Reshape] from near sequence, skip useless Constant
-    std::vector<std::tuple<std::string, onnx::NodeProto**>> candidates = {
-        {"Shape", &shape}, {"Slice", &slice}, {"Concat", &concat}, {"Reshape", &reshape}};
+                       std::map<std::string, int>&               node_reference,
+                       std::set<std::string>&                    blob_names,
+                       int&                                      reduced_node_count)
+{
+    std::map<std::string, std::vector<int>> shape_context;
+    const int node_count = mutable_graph->node_size();
+
+    for (int i = 0; i < node_count; i++)
+    {
+        onnx::NodeProto* conv = mutable_graph->mutable_node(i);
-    int MAX = std::min(10, node_count - i - 1);
-    int pos_candidate = 0;
+        if (conv->op_type() != "Conv")
+        {
+            continue;
+        }
-    for (int j = 0; j < MAX; ++j) {
-      auto node_ptr = mutable_graph->mutable_node(j + i + 1);
-      if (node_ptr->op_type() == "Constant") {
-        continue;
-      }
-      if (node_ptr->op_type() == std::get<0>(candidates[pos_candidate])) {
-        *(std::get<1>(candidates[pos_candidate])) = node_ptr;
-        pos_candidate++;
-      }
-    }
+        if (i + 4 >= node_count)
+        {
+            continue;
+        }
-    if (pos_candidate != candidates.size()) {
-      // not match the sequence
-      continue;
-    }
+        onnx::NodeProto * shape = nullptr, *slice = nullptr, *concat = nullptr, *reshape = nullptr;
+
+        // match [Shape ... Slice, Concat ...
Reshape] from near sequence, skip useless Constant + std::vector> candidates = { + {"Shape", &shape}, + {"Slice", &slice}, + {"Concat", &concat}, + {"Reshape", &reshape}}; + + int MAX = std::min(10, node_count - i - 1); + int pos_candidate = 0; + + for (int j = 0; j < MAX; ++j) + { + auto node_ptr = mutable_graph->mutable_node(j + i + 1); + if (node_ptr->op_type() == "Constant") + { + continue; + } + if (node_ptr->op_type() == std::get<0>(candidates[pos_candidate])) + { + *(std::get<1>(candidates[pos_candidate])) = node_ptr; + pos_candidate++; + } + } - if (node_reference[conv->output(0)] != 2 || node_reference[shape->output(0)] != 1 || - node_reference[slice->output(0)] != 1 || node_reference[concat->output(0)] != 1 || - node_reference[reshape->output(0)] != 1) { - continue; - } + if (pos_candidate != candidates.size()) + { + // not match the sequence + continue; + } - // check the connections - if (shape->input(0) != conv->output(0) || reshape->input(0) != conv->output(0)) { - continue; - } - if (slice->input(0) != shape->output(0)) { - continue; - } - if (concat->input(0) != slice->output(0)) { - continue; - } - if (reshape->input(0) != conv->output(0) || reshape->input(1) != concat->output(0)) { - continue; - } + if (node_reference[conv->output(0)] != 2 || node_reference[shape->output(0)] != 1 || + node_reference[slice->output(0)] != 1 || node_reference[concat->output(0)] != 1 || + node_reference[reshape->output(0)] != 1) + { + continue; + } - // add reshape attr - auto result = query_shape(mutable_graph, concat, weights, shape_context); - if (!std::get<0>(result)) { - continue; - } - set_node_attr_ai(*reshape, "shape", std::get<1>(result)); + // check the connections + if (shape->input(0) != conv->output(0) || reshape->input(0) != conv->output(0)) + { + continue; + } + if (slice->input(0) != shape->output(0)) + { + continue; + } + if (concat->input(0) != slice->output(0)) + { + continue; + } + if (reshape->input(0) != conv->output(0) || reshape->input(1) != concat->output(0)) + { + continue; + } - // reconstruct graph - { - // remove reference - node_reference[reshape->input(1)] -= 1; - node_reference[concat->input(0)] -= 1; - node_reference[slice->input(0)] -= 1; - node_reference[shape->input(0)] -= 1; - - // remove tensor/blob on edge - blob_names.erase(slice->input(0)); - blob_names.erase(slice->input(1)); - blob_names.erase(slice->input(2)); - blob_names.erase(slice->input(3)); - weights.erase(slice->input(1)); - weights.erase(slice->input(2)); - weights.erase(slice->input(3)); - - blob_names.erase(concat->input(0)); - blob_names.erase(concat->input(1)); - weights.erase(concat->input(1)); - - blob_names.erase(reshape->input(0)); - - // update edge - shape->clear_input(); - reshape->clear_input(); - reshape->add_input(conv->output(0)); - - shape->set_op_type("noop_reducedncnn"); - slice->set_op_type("noop_reducedncnn"); - concat->set_op_type("noop_reducedncnn"); - - reduced_node_count += 3; + // add reshape attr + auto result = query_shape(mutable_graph, concat, weights, shape_context); + if (!std::get<0>(result)) + { + continue; + } + set_node_attr_ai(*reshape, "shape", std::get<1>(result)); + + // reconstruct graph + { + // remove reference + node_reference[reshape->input(1)] -= 1; + node_reference[concat->input(0)] -= 1; + node_reference[slice->input(0)] -= 1; + node_reference[shape->input(0)] -= 1; + + // remove tensor/blob on edge + blob_names.erase(slice->input(0)); + blob_names.erase(slice->input(1)); + blob_names.erase(slice->input(2)); + blob_names.erase(slice->input(3)); 
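+            // Editor's note (illustrative comment, not part of the original change):
+            // inputs 1..3 of this Slice are the starts/ends/axes initializers of the
+            // matched Shape - Slice - Concat chain; once the computed shape has been
+            // baked into the Reshape "shape" attribute above, those tensors are dead,
+            // so they are dropped from both blob_names and weights.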
+ weights.erase(slice->input(1)); + weights.erase(slice->input(2)); + weights.erase(slice->input(3)); + + blob_names.erase(concat->input(0)); + blob_names.erase(concat->input(1)); + weights.erase(concat->input(1)); + + blob_names.erase(reshape->input(0)); + + // update edge + shape->clear_input(); + reshape->clear_input(); + reshape->add_input(conv->output(0)); + + shape->set_op_type("noop_reducedncnn"); + slice->set_op_type("noop_reducedncnn"); + concat->set_op_type("noop_reducedncnn"); + + reduced_node_count += 3; + } + i += 3; } - i += 3; - } } -void fuse_binaryop_with_scalar(onnx::GraphProto* mutable_graph, +void fuse_binaryop_with_scalar(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // Add/Sub/Mul/Div/Min/Max/Pow - if (node->op_type() == "Add" || node->op_type() == "Sub" || node->op_type() == "Mul" || - node->op_type() == "Div" || node->op_type() == "Max" || node->op_type() == "Min" || - node->op_type() == "Pow") { - if (weights.find(node->input(1)) == weights.end()) continue; + // Add/Sub/Mul/Div/Min/Max/Pow + if (node->op_type() == "Add" || node->op_type() == "Sub" || node->op_type() == "Mul" || + node->op_type() == "Div" || node->op_type() == "Max" || node->op_type() == "Min" || + node->op_type() == "Pow") + { + if (weights.find(node->input(1)) == weights.end()) continue; - const onnx::TensorProto& scalar_b = weights[node->input(1)]; - if (scalar_b.dims_size() != 0 || get_tensor_proto_data_size(scalar_b) != 1) continue; + const onnx::TensorProto& scalar_b = weights[node->input(1)]; + if (scalar_b.dims_size() != 0 || get_tensor_proto_data_size(scalar_b) != 1) continue; - float b = get_node_attr_from_input(scalar_b); + float b = get_node_attr_from_input(scalar_b); - node_reference[node->input(1)] -= 1; + node_reference[node->input(1)] -= 1; - std::string input = node->input(0); + std::string input = node->input(0); - node->clear_input(); - node->add_input(input); + node->clear_input(); + node->add_input(input); - onnx::AttributeProto* attr_with_scalar = node->add_attribute(); - attr_with_scalar->set_name("with_scalar"); - attr_with_scalar->set_i(1); + onnx::AttributeProto* attr_with_scalar = node->add_attribute(); + attr_with_scalar->set_name("with_scalar"); + attr_with_scalar->set_i(1); - onnx::AttributeProto* attr_b = node->add_attribute(); - attr_b->set_name("b"); - attr_b->set_f(b); + onnx::AttributeProto* attr_b = node->add_attribute(); + attr_b->set_name("b"); + attr_b->set_f(b); + } } - } } -void fuse_hardswish(onnx::GraphProto* mutable_graph, +void fuse_hardswish(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Div(/6) - // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Mul(*(1/6)) - // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Constant - Div(/6) - // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Constant - Mul(*(1/6)) - // out = x * F.relu6(x + 
3, inplace=True) / 6 - if (node->op_type() == "Add") { - if (node_reference[node->output(0)] != 1) continue; - - if (i + 3 >= node_count) continue; - - if (weights.find(node->input(1)) == weights.end()) continue; - - const onnx::TensorProto& add_three = weights[node->input(1)]; - if (add_three.dims_size() != 0 || get_tensor_proto_data_size(add_three) != 1) continue; - - float constant_add_three = get_node_attr_from_input(add_three); - if (constant_add_three != 3.f) continue; - - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); - - if (node4->op_type() == "Constant") { - if (i + 4 >= node_count) continue; - - node4 = mutable_graph->mutable_node(i + 4); - } - - if (node2->op_type() != "Clip" || node3->op_type() != "Mul" || - (node4->op_type() != "Div" && node4->op_type() != "Mul")) - continue; - - if (node_reference[node2->output(0)] != 1) continue; - - float relu6_min; - float relu6_max; - if (node2->input_size() == 1) { - relu6_min = get_node_attr_f(*node2, "min", -FLT_MAX); - relu6_max = get_node_attr_f(*node2, "max", FLT_MAX); - } else { - const onnx::TensorProto& min_tp = weights[node2->input(1)]; - const onnx::TensorProto& max_tp = weights[node2->input(2)]; - - relu6_min = get_node_attr_from_input(min_tp); - relu6_max = get_node_attr_from_input(max_tp); - } - if (relu6_min != 0.f || relu6_max != 6.f) continue; - - if (node_reference[node3->output(0)] != 1) continue; - - if (node3->input(0) != node->input(0) || node3->input(1) != node2->output(0)) continue; - - if (weights.find(node4->input(1)) == weights.end()) continue; - - const onnx::TensorProto& div_six = weights[node4->input(1)]; - if (div_six.dims_size() != 0 || get_tensor_proto_data_size(div_six) != 1) continue; - - float constant_div_six = get_node_attr_from_input(div_six); - if (node4->op_type() == "Div" && constant_div_six != 6.f) continue; - if (node4->op_type() == "Mul" && constant_div_six != 1 / 6.f) continue; - - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); - node3->set_op_type("noop_reducedncnn"); - - node_reference[node->input(0)] -= 1; - node_reference[node->input(1)] -= 1; - node_reference[node->output(0)] -= 1; - if (node2->input_size() == 3) { - node_reference[node2->input(1)] -= 1; - node_reference[node2->input(2)] -= 1; - } - node_reference[node2->output(0)] -= 1; - node_reference[node3->output(0)] -= 1; - node_reference[node4->input(1)] -= 1; - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - blob_names.erase(node3->output(0)); - - node4->set_op_type("HardSwish"); - node4->clear_input(); - node4->add_input(node->input(0)); - - onnx::AttributeProto* attr_alpha = node4->add_attribute(); - attr_alpha->set_name("alpha"); - attr_alpha->set_f(1.f / 6.f); - - onnx::AttributeProto* attr_beta = node4->add_attribute(); - attr_beta->set_name("beta"); - attr_beta->set_f(3.f / 6.f); - - reduced_node_count += 3; - i += 3; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Div(/6) + // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Mul(*(1/6)) + // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Constant - Div(/6) + // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Constant - Mul(*(1/6)) + // out = 
x * F.relu6(x + 3, inplace=True) / 6 + if (node->op_type() == "Add") + { + if (node_reference[node->output(0)] != 1) continue; + + if (i + 3 >= node_count) continue; + + if (weights.find(node->input(1)) == weights.end()) continue; + + const onnx::TensorProto& add_three = weights[node->input(1)]; + if (add_three.dims_size() != 0 || get_tensor_proto_data_size(add_three) != 1) continue; + + float constant_add_three = get_node_attr_from_input(add_three); + if (constant_add_three != 3.f) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); + + if (node4->op_type() == "Constant") + { + if (i + 4 >= node_count) continue; + + node4 = mutable_graph->mutable_node(i + 4); + } + + if (node2->op_type() != "Clip" || node3->op_type() != "Mul" || + (node4->op_type() != "Div" && node4->op_type() != "Mul")) + continue; + + if (node_reference[node2->output(0)] != 1) continue; + + float relu6_min; + float relu6_max; + if (node2->input_size() == 1) + { + relu6_min = get_node_attr_f(*node2, "min", -FLT_MAX); + relu6_max = get_node_attr_f(*node2, "max", FLT_MAX); + } + else + { + const onnx::TensorProto& min_tp = weights[node2->input(1)]; + const onnx::TensorProto& max_tp = weights[node2->input(2)]; + + relu6_min = get_node_attr_from_input(min_tp); + relu6_max = get_node_attr_from_input(max_tp); + } + if (relu6_min != 0.f || relu6_max != 6.f) continue; + + if (node_reference[node3->output(0)] != 1) continue; + + if (node3->input(0) != node->input(0) || node3->input(1) != node2->output(0)) continue; + + if (weights.find(node4->input(1)) == weights.end()) continue; + + const onnx::TensorProto& div_six = weights[node4->input(1)]; + if (div_six.dims_size() != 0 || get_tensor_proto_data_size(div_six) != 1) continue; + + float constant_div_six = get_node_attr_from_input(div_six); + if (node4->op_type() == "Div" && constant_div_six != 6.f) continue; + if (node4->op_type() == "Mul" && constant_div_six != 1 / 6.f) continue; + + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + node3->set_op_type("noop_reducedncnn"); + + node_reference[node->input(0)] -= 1; + node_reference[node->input(1)] -= 1; + node_reference[node->output(0)] -= 1; + if (node2->input_size() == 3) + { + node_reference[node2->input(1)] -= 1; + node_reference[node2->input(2)] -= 1; + } + node_reference[node2->output(0)] -= 1; + node_reference[node3->output(0)] -= 1; + node_reference[node4->input(1)] -= 1; + + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + blob_names.erase(node3->output(0)); + + node4->set_op_type("HardSwish"); + node4->clear_input(); + node4->add_input(node->input(0)); + + onnx::AttributeProto* attr_alpha = node4->add_attribute(); + attr_alpha->set_name("alpha"); + attr_alpha->set_f(1.f / 6.f); + + onnx::AttributeProto* attr_beta = node4->add_attribute(); + attr_beta->set_name("beta"); + attr_beta->set_f(3.f / 6.f); + + reduced_node_count += 3; + i += 3; + } } - } - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // HardSwish <= HardSigmoid - Mul - // out = x * hsigmoid(x) - if (node->op_type() == "HardSigmoid") { - if (node_reference[node->output(0)] != 1) continue; + // HardSwish <= HardSigmoid - Mul + // out = x * hsigmoid(x) + if (node->op_type() == "HardSigmoid") + { 
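+            // Editor's note (illustrative comment, not part of the original change):
+            // with the ONNX defaults alpha = 0.2 and beta = 0.5 read below, this branch
+            // rewrites
+            //   y = x * clip(alpha * x + beta, 0, 1)
+            // into a single HardSwish layer reusing the same alpha/beta; e.g. x = 1.0
+            // gives 1.0 * clip(0.7, 0, 1) = 0.7 both before and after the fusion.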
+ if (node_reference[node->output(0)] != 1) continue; - float alpha = get_node_attr_f(*node, "alpha", 0.2f); - float beta = get_node_attr_f(*node, "beta", 0.5f); + float alpha = get_node_attr_f(*node, "alpha", 0.2f); + float beta = get_node_attr_f(*node, "beta", 0.5f); - if (i + 1 >= node_count) continue; + if (i + 1 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - if (node2->op_type() != "Mul") continue; + if (node2->op_type() != "Mul") continue; - if (node2->input(0) != node->input(0) || node2->input(1) != node->output(0)) continue; + if (node2->input(0) != node->input(0) || node2->input(1) != node->output(0)) continue; - // reduce - node->set_op_type("noop_reducedncnn"); + // reduce + node->set_op_type("noop_reducedncnn"); - node_reference[node->input(0)] -= 1; - node_reference[node->output(0)] -= 1; + node_reference[node->input(0)] -= 1; + node_reference[node->output(0)] -= 1; - blob_names.erase(node->output(0)); + blob_names.erase(node->output(0)); - node2->set_op_type("HardSwish"); - node2->clear_input(); - node2->add_input(node->input(0)); + node2->set_op_type("HardSwish"); + node2->clear_input(); + node2->add_input(node->input(0)); - onnx::AttributeProto* attr_alpha = node2->add_attribute(); - attr_alpha->set_name("alpha"); - attr_alpha->set_f(alpha); + onnx::AttributeProto* attr_alpha = node2->add_attribute(); + attr_alpha->set_name("alpha"); + attr_alpha->set_f(alpha); - onnx::AttributeProto* attr_beta = node2->add_attribute(); - attr_beta->set_name("beta"); - attr_beta->set_f(beta); + onnx::AttributeProto* attr_beta = node2->add_attribute(); + attr_beta->set_name("beta"); + attr_beta->set_f(beta); - reduced_node_count += 1; - i += 1; + reduced_node_count += 1; + i += 1; + } } - } } -void fuse_hardsigmoid(onnx::GraphProto* mutable_graph, +void fuse_hardsigmoid(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // HardSigmoid <= Add(+3) - Clip(0,6) - Div(/6) - // HardSigmoid <= Add(+3) - Clip(0,6) - Mul(*(1/6)) - // HardSigmoid <= Add(+3) - Clip(0,6) - Constant - Div(/6) - // HardSigmoid <= Add(+3) - Clip(0,6) - Constant - Mul(*(1/6)) - // out = F.relu6(x + 3, inplace=True) / 6 - if (node->op_type() == "Add") { - if (node_reference[node->output(0)] != 1) continue; - - if (i + 2 >= node_count) continue; - - if (weights.find(node->input(1)) == weights.end()) continue; - - const onnx::TensorProto& add_three = weights[node->input(1)]; - if (add_three.dims_size() != 0 || get_tensor_proto_data_size(add_three) != 1) continue; - - float constant_add_three = get_node_attr_from_input(add_three); - if (constant_add_three != 3.f) continue; - - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - - if (node3->op_type() == "Constant") { - if (i + 3 >= node_count) continue; - - node3 = mutable_graph->mutable_node(i + 3); - } - - if (node2->op_type() != "Clip" || (node3->op_type() != "Div" && node3->op_type() != "Mul")) - continue; - - if (node_reference[node2->output(0)] != 1) continue; - - float relu6_min; - float relu6_max; - if (node2->input_size() == 1) { - relu6_min = get_node_attr_f(*node2, "min", -FLT_MAX); - relu6_max = get_node_attr_f(*node2, "max", FLT_MAX); - } else { - const 
onnx::TensorProto& min_tp = weights[node2->input(1)]; - const onnx::TensorProto& max_tp = weights[node2->input(2)]; - - relu6_min = get_node_attr_from_input(min_tp); - relu6_max = get_node_attr_from_input(max_tp); - } - if (relu6_min != 0.f || relu6_max != 6.f) continue; - - if (weights.find(node3->input(1)) == weights.end()) continue; - - const onnx::TensorProto& div_six = weights[node3->input(1)]; - if (div_six.dims_size() != 0 || get_tensor_proto_data_size(div_six) != 1) continue; - - float constant_div_six = get_node_attr_from_input(div_six); - if (node3->op_type() == "Div" && constant_div_six != 6.f) continue; - if (node3->op_type() == "Mul" && constant_div_six != 1 / 6.f) continue; - - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); - - node_reference[node->input(1)] -= 1; - node_reference[node->output(0)] -= 1; - if (node2->input_size() == 3) { - node_reference[node2->input(1)] -= 1; - node_reference[node2->input(2)] -= 1; - } - node_reference[node2->output(0)] -= 1; - node_reference[node3->input(1)] -= 1; - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - - node3->set_op_type("HardSigmoid"); - node3->clear_input(); - node3->add_input(node->input(0)); - - onnx::AttributeProto* attr_alpha = node3->add_attribute(); - attr_alpha->set_name("alpha"); - attr_alpha->set_f(1.f / 6.f); - - onnx::AttributeProto* attr_beta = node3->add_attribute(); - attr_beta->set_name("beta"); - attr_beta->set_f(3.f / 6.f); - - reduced_node_count += 2; - i += 2; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // HardSigmoid <= Add(+3) - Clip(0,6) - Div(/6) + // HardSigmoid <= Add(+3) - Clip(0,6) - Mul(*(1/6)) + // HardSigmoid <= Add(+3) - Clip(0,6) - Constant - Div(/6) + // HardSigmoid <= Add(+3) - Clip(0,6) - Constant - Mul(*(1/6)) + // out = F.relu6(x + 3, inplace=True) / 6 + if (node->op_type() == "Add") + { + if (node_reference[node->output(0)] != 1) continue; + + if (i + 2 >= node_count) continue; + + if (weights.find(node->input(1)) == weights.end()) continue; + + const onnx::TensorProto& add_three = weights[node->input(1)]; + if (add_three.dims_size() != 0 || get_tensor_proto_data_size(add_three) != 1) continue; + + float constant_add_three = get_node_attr_from_input(add_three); + if (constant_add_three != 3.f) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + + if (node3->op_type() == "Constant") + { + if (i + 3 >= node_count) continue; + + node3 = mutable_graph->mutable_node(i + 3); + } + + if (node2->op_type() != "Clip" || (node3->op_type() != "Div" && node3->op_type() != "Mul")) + continue; + + if (node_reference[node2->output(0)] != 1) continue; + + float relu6_min; + float relu6_max; + if (node2->input_size() == 1) + { + relu6_min = get_node_attr_f(*node2, "min", -FLT_MAX); + relu6_max = get_node_attr_f(*node2, "max", FLT_MAX); + } + else + { + const onnx::TensorProto& min_tp = weights[node2->input(1)]; + const onnx::TensorProto& max_tp = weights[node2->input(2)]; + + relu6_min = get_node_attr_from_input(min_tp); + relu6_max = get_node_attr_from_input(max_tp); + } + if (relu6_min != 0.f || relu6_max != 6.f) continue; + + if (weights.find(node3->input(1)) == weights.end()) continue; + + const onnx::TensorProto& div_six = weights[node3->input(1)]; + if 
(div_six.dims_size() != 0 || get_tensor_proto_data_size(div_six) != 1) continue; + + float constant_div_six = get_node_attr_from_input(div_six); + if (node3->op_type() == "Div" && constant_div_six != 6.f) continue; + if (node3->op_type() == "Mul" && constant_div_six != 1 / 6.f) continue; + + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + + node_reference[node->input(1)] -= 1; + node_reference[node->output(0)] -= 1; + if (node2->input_size() == 3) + { + node_reference[node2->input(1)] -= 1; + node_reference[node2->input(2)] -= 1; + } + node_reference[node2->output(0)] -= 1; + node_reference[node3->input(1)] -= 1; + + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + + node3->set_op_type("HardSigmoid"); + node3->clear_input(); + node3->add_input(node->input(0)); + + onnx::AttributeProto* attr_alpha = node3->add_attribute(); + attr_alpha->set_name("alpha"); + attr_alpha->set_f(1.f / 6.f); + + onnx::AttributeProto* attr_beta = node3->add_attribute(); + attr_beta->set_name("beta"); + attr_beta->set_f(3.f / 6.f); + + reduced_node_count += 2; + i += 2; + } } - } } -void fuse_swish(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); +void fuse_swish(onnx::GraphProto* mutable_graph, std::map& weights, std::map& node_reference, std::set& blob_names, int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // Swish <= Sigmoid - Mul - // x * torch.sigmoid(x) - if (node->op_type() == "Sigmoid") { - if (node_reference[node->output(0)] != 1) continue; + // Swish <= Sigmoid - Mul + // x * torch.sigmoid(x) + if (node->op_type() == "Sigmoid") + { + if (node_reference[node->output(0)] != 1) continue; - if (i + 1 >= node_count) continue; + if (i + 1 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - if (node2->op_type() != "Mul") continue; + if (node2->op_type() != "Mul") continue; - if (node2->input(0) != node->input(0) || node2->input(1) != node->output(0)) continue; + if (node2->input(0) != node->input(0) || node2->input(1) != node->output(0)) continue; - // reduce - node->set_op_type("noop_reducedncnn"); + // reduce + node->set_op_type("noop_reducedncnn"); - node_reference[node->input(0)] -= 1; - node_reference[node->output(0)] -= 1; + node_reference[node->input(0)] -= 1; + node_reference[node->output(0)] -= 1; - blob_names.erase(node->output(0)); + blob_names.erase(node->output(0)); - node2->set_op_type("Swish"); - node2->clear_input(); - node2->add_input(node->input(0)); + node2->set_op_type("Swish"); + node2->clear_input(); + node2->add_input(node->input(0)); - reduced_node_count += 1; - i += 1; + reduced_node_count += 1; + i += 1; + } } - } } -void fuse_batchnorm1d_squeeze_unsqueeze(onnx::GraphProto* mutable_graph, +void fuse_batchnorm1d_squeeze_unsqueeze(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + std::map& node_reference, + std::set& blob_names, + int& 
reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // BatchNormalization <= Unsqueeze - BatchNormalization - Squeeze - if (node->op_type() == "Unsqueeze") { - if (node_reference[node->output(0)] != 1) continue; + // BatchNormalization <= Unsqueeze - BatchNormalization - Squeeze + if (node->op_type() == "Unsqueeze") + { + if (node_reference[node->output(0)] != 1) continue; - if (i + 2 >= node_count) continue; + if (i + 2 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - if (node2->op_type() != "BatchNormalization" || node3->op_type() != "Squeeze") continue; + if (node2->op_type() != "BatchNormalization" || node3->op_type() != "Squeeze") continue; - if (node_reference[node2->output(0)] != 1) continue; + if (node_reference[node2->output(0)] != 1) continue; - if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0)) continue; + if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0)) continue; - // reduce - node->set_op_type("noop_reducedncnn"); - node3->set_op_type("noop_reducedncnn"); + // reduce + node->set_op_type("noop_reducedncnn"); + node3->set_op_type("noop_reducedncnn"); - node_reference[node->output(0)] -= 1; - node_reference[node2->output(0)] -= 1; + node_reference[node->output(0)] -= 1; + node_reference[node2->output(0)] -= 1; - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); - node2->set_input(0, node->input(0)); - node2->set_output(0, node3->output(0)); + node2->set_input(0, node->input(0)); + node2->set_output(0, node3->output(0)); - reduced_node_count += 2; - i += 2; + reduced_node_count += 2; + i += 2; + } } - } } -void fuse_unsqueeze_prelu(onnx::GraphProto* mutable_graph, +void fuse_unsqueeze_prelu(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // PReLU <= Unsqueeze - PReLU - if (node->op_type() == "Unsqueeze") { - // check weight - if (weights.find(node->input(0)) == weights.end()) continue; + // PReLU <= Unsqueeze - PReLU + if (node->op_type() == "Unsqueeze") + { + // check weight + if (weights.find(node->input(0)) == weights.end()) continue; - onnx::TensorProto& B = weights[node->input(0)]; - if (B.dims_size() != 1) continue; + onnx::TensorProto& B = weights[node->input(0)]; + if (B.dims_size() != 1) continue; - if (node_reference[node->output(0)] != 1) continue; + if (node_reference[node->output(0)] != 1) continue; - // axes = (1, 2) - std::vector axes = get_node_attr_ai(*node, "axes"); - if (axes.size() != 2) continue; - if (axes[0] != 1 || axes[1] != 2) continue; + // axes = (1, 2) + std::vector axes = get_node_attr_ai(*node, "axes"); + if (axes.size() != 2) continue; + if (axes[0] != 1 || axes[1] != 2) continue; - if (i + 1 >= 
node_count) continue; + if (i + 1 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - if (node2->op_type() != "PRelu") continue; + if (node2->op_type() != "PRelu") continue; - if (node2->input(1) != node->output(0)) continue; + if (node2->input(1) != node->output(0)) continue; - // reduce - node->set_op_type("noop_reducedncnn"); + // reduce + node->set_op_type("noop_reducedncnn"); - node_reference[node->output(0)] -= 1; + node_reference[node->output(0)] -= 1; - blob_names.erase(node->output(0)); + blob_names.erase(node->output(0)); - node2->set_input(1, node->input(0)); + node2->set_input(1, node->input(0)); - reduced_node_count += 1; - i += 1; + reduced_node_count += 1; + i += 1; + } } - } } -void fuse_normalize(onnx::GraphProto* mutable_graph, +void fuse_normalize(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // Normalize <= X - ReduceL2 - Clip - Expand - Div - // Normalize <= X - ReduceL2 - Clip - Shape - Expand - Div - if (node->op_type() == "ReduceL2") { - if (node_reference[node->output(0)] != 1) continue; - - // axes = (1) - std::vector axes = get_node_attr_ai(*node, "axes"); - if (axes.size() != 1) continue; - if (axes[0] != 1) continue; - - if (i + 3 >= node_count) continue; - - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); - - bool has_shape_node = node3->op_type() == "Shape"; - onnx::NodeProto* node_shape = 0; - if (has_shape_node) { - if (i + 4 >= node_count) continue; - - node_shape = node3; - node3 = mutable_graph->mutable_node(i + 3); - node4 = mutable_graph->mutable_node(i + 4); - } - - if (node2->op_type() != "Clip" || node3->op_type() != "Expand" || node4->op_type() != "Div") - continue; - - if (node_reference[node2->output(0)] != 1) continue; - - if (node_reference[node3->output(0)] != 1) continue; - - if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0) || - node4->input(0) != node->input(0) || node4->input(1) != node3->output(0)) - continue; - - if (has_shape_node) { - if (node_shape->input(0) != node->input(0) || node3->input(1) != node_shape->output(0)) - continue; - } - - // +eps - float clip_min; - if (node2->input_size() == 1) { - clip_min = get_node_attr_f(*node2, "min", -FLT_MAX); - } else { - const onnx::TensorProto& min_tp = weights[node2->input(1)]; - - clip_min = get_node_attr_from_input(min_tp); - } - - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); - if (has_shape_node) { - node_shape->set_op_type("noop_reducedncnn"); - } - node3->set_op_type("noop_reducedncnn"); - - node_reference[node->input(0)] -= has_shape_node ? 
2 : 1; - node_reference[node->output(0)] -= 1; - node_reference[node2->output(0)] -= 1; - if (has_shape_node) { - node_reference[node_shape->output(0)] -= 1; - } - node_reference[node3->output(0)] -= 1; - if (node3->input_size() == 2) { - node_reference[node3->input(1)] -= 1; - } - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - if (has_shape_node) { - blob_names.erase(node_shape->output(0)); - } - blob_names.erase(node3->output(0)); - - node4->set_op_type("Normalize"); - node4->clear_input(); - node4->add_input(node->input(0)); - - onnx::AttributeProto* attr_alpha = node4->add_attribute(); - attr_alpha->set_name("eps"); - attr_alpha->set_f(clip_min); - - reduced_node_count += has_shape_node ? 4 : 3; - i += has_shape_node ? 4 : 3; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // Normalize <= X - ReduceL2 - Clip - Expand - Div + // Normalize <= X - ReduceL2 - Clip - Shape - Expand - Div + if (node->op_type() == "ReduceL2") + { + if (node_reference[node->output(0)] != 1) continue; + + // axes = (1) + std::vector axes = get_node_attr_ai(*node, "axes"); + if (axes.size() != 1) continue; + if (axes[0] != 1) continue; + + if (i + 3 >= node_count) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); + + bool has_shape_node = node3->op_type() == "Shape"; + onnx::NodeProto* node_shape = 0; + if (has_shape_node) + { + if (i + 4 >= node_count) continue; + + node_shape = node3; + node3 = mutable_graph->mutable_node(i + 3); + node4 = mutable_graph->mutable_node(i + 4); + } + + if (node2->op_type() != "Clip" || node3->op_type() != "Expand" || node4->op_type() != "Div") + continue; + + if (node_reference[node2->output(0)] != 1) continue; + + if (node_reference[node3->output(0)] != 1) continue; + + if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0) || + node4->input(0) != node->input(0) || node4->input(1) != node3->output(0)) + continue; + + if (has_shape_node) + { + if (node_shape->input(0) != node->input(0) || node3->input(1) != node_shape->output(0)) + continue; + } + + // +eps + float clip_min; + if (node2->input_size() == 1) + { + clip_min = get_node_attr_f(*node2, "min", -FLT_MAX); + } + else + { + const onnx::TensorProto& min_tp = weights[node2->input(1)]; + + clip_min = get_node_attr_from_input(min_tp); + } + + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + if (has_shape_node) + { + node_shape->set_op_type("noop_reducedncnn"); + } + node3->set_op_type("noop_reducedncnn"); + + node_reference[node->input(0)] -= has_shape_node ? 
2 : 1; + node_reference[node->output(0)] -= 1; + node_reference[node2->output(0)] -= 1; + if (has_shape_node) + { + node_reference[node_shape->output(0)] -= 1; + } + node_reference[node3->output(0)] -= 1; + if (node3->input_size() == 2) + { + node_reference[node3->input(1)] -= 1; + } + + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + if (has_shape_node) + { + blob_names.erase(node_shape->output(0)); + } + blob_names.erase(node3->output(0)); + + node4->set_op_type("Normalize"); + node4->clear_input(); + node4->add_input(node->input(0)); + + onnx::AttributeProto* attr_alpha = node4->add_attribute(); + attr_alpha->set_name("eps"); + attr_alpha->set_f(clip_min); + + reduced_node_count += has_shape_node ? 4 : 3; + i += has_shape_node ? 4 : 3; + } } - } } -void fuse_groupnorm(onnx::GraphProto* mutable_graph, +void fuse_groupnorm(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // GroupNorm <= X - Reshape - InstanceNormalization - Reshape - Mul - Add - if (node->op_type() == "Reshape") { - if (node_reference[node->output(0)] != 1) continue; - - std::vector shape; - if (node->input_size() == 1) { - shape = get_node_attr_ai(*node, "shape"); - } else { - // skip weight reshape - if (weights.find(node->input(1)) == weights.end()) continue; - - shape = get_node_attr_from_input_ai(weights[node->input(1)]); - } - - // 0, group, -1 - if (shape.size() != 3) continue; - - if (shape[0] != 0 || shape[2] != -1) continue; - - int groups = shape[1]; - - if (i + 4 >= node_count) continue; - - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); - onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); - - if (node2->op_type() != "InstanceNormalization" || node3->op_type() != "Reshape" || - node4->op_type() != "Mul" || node5->op_type() != "Add") - continue; - - if (node_reference[node2->output(0)] != 1) continue; - - if (node_reference[node3->output(0)] != 1) continue; - - if (node_reference[node4->output(0)] != 1) continue; - - if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0) || - node4->input(0) != node3->output(0) || node5->input(0) != node4->output(0)) - continue; - - // +eps - float eps = get_node_attr_f(*node2, "epsilon", 1e-05f); - - // InstanceNormalization S=1 B=0 - std::vector S = get_node_attr_from_input_af(weights[node2->input(1)]); - std::vector B = get_node_attr_from_input_af(weights[node2->input(2)]); - if ((int)S.size() != groups || (int)B.size() != groups) continue; - - bool instancenorm_affine = false; - for (int j = 0; j < groups; j++) { - if (S[j] != 1.f || B[j] != 0.f) { - instancenorm_affine = true; - break; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // GroupNorm <= X - Reshape - InstanceNormalization - Reshape - Mul - Add + if (node->op_type() == "Reshape") + { + if (node_reference[node->output(0)] != 1) continue; + + std::vector shape; + if (node->input_size() == 1) + { + shape = get_node_attr_ai(*node, "shape"); + } + else + { + // skip weight reshape + if 
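// --- Illustrative sketch (not part of the converter): the fuse_normalize pass
// above relies on x / max(||x||_2, eps) collapsing into a single Normalize op.
// A minimal standalone check of that identity on a toy channel vector:
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const float eps = 1e-10f;  // plays the role of the Clip "min" input/attribute
    std::vector<float> x = {3.f, 4.f};

    float l2 = 0.f;  // ReduceL2 over the channel axis
    for (float v : x) l2 += v * v;
    l2 = std::sqrt(l2);

    float denom = std::max(l2, eps);  // Clip(min=eps)
    for (float v : x) std::printf("%f ", v / denom);  // final Div: prints 0.6 0.8
    std::printf("\n");
    return 0;
}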
(weights.find(node->input(1)) == weights.end()) continue; + + shape = get_node_attr_from_input_ai(weights[node->input(1)]); + } + + // 0, group, -1 + if (shape.size() != 3) continue; + + if (shape[0] != 0 || shape[2] != -1) continue; + + int groups = shape[1]; + + if (i + 4 >= node_count) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); + onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); + + if (node2->op_type() != "InstanceNormalization" || node3->op_type() != "Reshape" || + node4->op_type() != "Mul" || node5->op_type() != "Add") + continue; + + if (node_reference[node2->output(0)] != 1) continue; + + if (node_reference[node3->output(0)] != 1) continue; + + if (node_reference[node4->output(0)] != 1) continue; + + if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0) || + node4->input(0) != node3->output(0) || node5->input(0) != node4->output(0)) + continue; + + // +eps + float eps = get_node_attr_f(*node2, "epsilon", 1e-05f); + + // InstanceNormalization S=1 B=0 + std::vector S = get_node_attr_from_input_af(weights[node2->input(1)]); + std::vector B = get_node_attr_from_input_af(weights[node2->input(2)]); + if ((int)S.size() != groups || (int)B.size() != groups) continue; + + bool instancenorm_affine = false; + for (int j = 0; j < groups; j++) + { + if (S[j] != 1.f || B[j] != 0.f) + { + instancenorm_affine = true; + break; + } + } + + if (instancenorm_affine) continue; + + std::vector shape2; + if (node3->input_size() == 1) + { + shape2 = get_node_attr_ai(*node3, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node3->input(1)) == weights.end()) continue; + + shape2 = get_node_attr_from_input_ai(weights[node3->input(1)]); + } + + // 1, channels, w, h + if (shape2.size() != 4) continue; + + if (shape2[0] != 1) continue; + + int channels = shape2[1]; + + // affine + std::vector affine_S = get_node_attr_from_input_af(weights[node4->input(1)]); + std::vector affine_B = get_node_attr_from_input_af(weights[node5->input(1)]); + if (affine_S.size() == 1 && affine_S[0] == 1.f && affine_B.size() == 1 && + affine_B[0] == 0.f) + { + // no affine + } + else if ((int)affine_S.size() != channels && (int)affine_B.size() != channels) + { + // we only allow per-channel affine + continue; + } + + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + node3->set_op_type("noop_reducedncnn"); + node4->set_op_type("noop_reducedncnn"); + + if (node->input_size() == 2) + { + node_reference[node->input(1)] -= 1; + } + node_reference[node->output(0)] -= 1; + node_reference[node2->input(1)] -= 1; + node_reference[node2->input(2)] -= 1; + node_reference[node2->output(0)] -= 1; + if (node3->input_size() == 2) + { + node_reference[node3->input(1)] -= 1; + } + node_reference[node3->output(0)] -= 1; + node_reference[node4->output(0)] -= 1; + + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + blob_names.erase(node3->output(0)); + blob_names.erase(node4->output(0)); + + std::string affine_scale = node4->input(1); + std::string affine_bias = node5->input(1); + + node5->set_op_type("GroupNorm"); + node5->clear_input(); + node5->add_input(node->input(0)); + node5->add_input(affine_scale); + node5->add_input(affine_bias); + + onnx::AttributeProto* attr_groups = node5->add_attribute(); + attr_groups->set_name("groups"); + attr_groups->set_i(groups); + + 
onnx::AttributeProto* attr_channels = node5->add_attribute(); + attr_channels->set_name("channels"); + attr_channels->set_i(channels); + + onnx::AttributeProto* attr_eps = node5->add_attribute(); + attr_eps->set_name("epsilon"); + attr_eps->set_f(eps); + + onnx::AttributeProto* attr_affine = node5->add_attribute(); + attr_affine->set_name("affine"); + attr_affine->set_i(1); + + reduced_node_count += 4; + i += 4; } - } - - if (instancenorm_affine) continue; - - std::vector shape2; - if (node3->input_size() == 1) { - shape2 = get_node_attr_ai(*node3, "shape"); - } else { - // skip weight reshape - if (weights.find(node3->input(1)) == weights.end()) continue; - - shape2 = get_node_attr_from_input_ai(weights[node3->input(1)]); - } - - // 1, channels, w, h - if (shape2.size() != 4) continue; - - if (shape2[0] != 1) continue; - - int channels = shape2[1]; - - // affine - std::vector affine_S = get_node_attr_from_input_af(weights[node4->input(1)]); - std::vector affine_B = get_node_attr_from_input_af(weights[node5->input(1)]); - if (affine_S.size() == 1 && affine_S[0] == 1.f && affine_B.size() == 1 && - affine_B[0] == 0.f) { - // no affine - } else if ((int)affine_S.size() != channels && (int)affine_B.size() != channels) { - // we only allow per-channel affine - continue; - } - - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); - node3->set_op_type("noop_reducedncnn"); - node4->set_op_type("noop_reducedncnn"); - - if (node->input_size() == 2) { - node_reference[node->input(1)] -= 1; - } - node_reference[node->output(0)] -= 1; - node_reference[node2->input(1)] -= 1; - node_reference[node2->input(2)] -= 1; - node_reference[node2->output(0)] -= 1; - if (node3->input_size() == 2) { - node_reference[node3->input(1)] -= 1; - } - node_reference[node3->output(0)] -= 1; - node_reference[node4->output(0)] -= 1; - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - blob_names.erase(node3->output(0)); - blob_names.erase(node4->output(0)); - - std::string affine_scale = node4->input(1); - std::string affine_bias = node5->input(1); - - node5->set_op_type("GroupNorm"); - node5->clear_input(); - node5->add_input(node->input(0)); - node5->add_input(affine_scale); - node5->add_input(affine_bias); - - onnx::AttributeProto* attr_groups = node5->add_attribute(); - attr_groups->set_name("groups"); - attr_groups->set_i(groups); - - onnx::AttributeProto* attr_channels = node5->add_attribute(); - attr_channels->set_name("channels"); - attr_channels->set_i(channels); - - onnx::AttributeProto* attr_eps = node5->add_attribute(); - attr_eps->set_name("epsilon"); - attr_eps->set_f(eps); - - onnx::AttributeProto* attr_affine = node5->add_attribute(); - attr_affine->set_name("affine"); - attr_affine->set_i(1); - - reduced_node_count += 4; - i += 4; } - } } -void fuse_layernorm(onnx::GraphProto* mutable_graph, +void fuse_layernorm(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // LayerNorm <= X - ReduceMean - Sub - Pow - ReduceMean - Add - Sqrt - Div - // LayerNorm <= X - ReduceMean - Sub - Pow - ReduceMean - Add - Sqrt - Div - - // Mul - Add - if (node->op_type() == "ReduceMean") { - if (node_reference[node->output(0)] != 1) continue; - - std::vector axes = get_node_attr_ai(*node, "axes"); - - // -1 - // -2 -1 - if (axes.size() != 
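// --- Illustrative sketch (toy sizes, not converter code): the fuse_groupnorm
// pass above collapses Reshape(0,G,-1) -> InstanceNormalization(S=1,B=0) ->
// Reshape -> Mul -> Add into one GroupNorm. The arithmetic being matched, for
// one sample, before the per-channel Mul/Add affine pair:
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const int channels = 4, groups = 2, hw = 2;
    const float eps = 1e-5f;  // the "epsilon" attribute read from node2
    std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8};  // [C, H*W], flattened

    const int per_group = channels / groups * hw;
    for (int g = 0; g < groups; g++)
    {
        float mean = 0.f, var = 0.f;
        for (int j = 0; j < per_group; j++) mean += x[g * per_group + j];
        mean /= per_group;
        for (int j = 0; j < per_group; j++)
        {
            float d = x[g * per_group + j] - mean;
            var += d * d;
        }
        var /= per_group;
        for (int j = 0; j < per_group; j++)
            std::printf("%f ", (x[g * per_group + j] - mean) / std::sqrt(var + eps));
    }
    std::printf("\n");
    return 0;
}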
1 && axes.size() != 2) continue; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - int normed_axes = (int)axes.size(); - if (normed_axes == 1 && axes[0] != -1) continue; - if (normed_axes == 2 && (axes[0] != -2 || axes[1] != -1)) continue; + // LayerNorm <= X - ReduceMean - Sub - Pow - ReduceMean - Add - Sqrt - Div + // LayerNorm <= X - ReduceMean - Sub - Pow - ReduceMean - Add - Sqrt - Div - + // Mul - Add + if (node->op_type() == "ReduceMean") + { + if (node_reference[node->output(0)] != 1) continue; - if (i + 6 >= node_count) continue; + std::vector axes = get_node_attr_ai(*node, "axes"); - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); - onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); - onnx::NodeProto* node6 = mutable_graph->mutable_node(i + 5); - onnx::NodeProto* node7 = mutable_graph->mutable_node(i + 6); + // -1 + // -2 -1 + if (axes.size() != 1 && axes.size() != 2) continue; - if (node2->op_type() != "Sub" || node3->op_type() != "Pow" || - node4->op_type() != "ReduceMean" || node5->op_type() != "Add" || - node6->op_type() != "Sqrt" || node7->op_type() != "Div") - continue; + int normed_axes = (int)axes.size(); + if (normed_axes == 1 && axes[0] != -1) continue; + if (normed_axes == 2 && (axes[0] != -2 || axes[1] != -1)) continue; - if (node_reference[node2->output(0)] != 2) continue; + if (i + 6 >= node_count) continue; - if (node_reference[node3->output(0)] != 1) continue; + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); + onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); + onnx::NodeProto* node6 = mutable_graph->mutable_node(i + 5); + onnx::NodeProto* node7 = mutable_graph->mutable_node(i + 6); - if (node_reference[node4->output(0)] != 1) continue; + if (node2->op_type() != "Sub" || node3->op_type() != "Pow" || + node4->op_type() != "ReduceMean" || node5->op_type() != "Add" || + node6->op_type() != "Sqrt" || node7->op_type() != "Div") + continue; - if (node_reference[node5->output(0)] != 1) continue; + if (node_reference[node2->output(0)] != 2) continue; - if (node_reference[node6->output(0)] != 1) continue; + if (node_reference[node3->output(0)] != 1) continue; - if (node2->input(0) != node->input(0) || node2->input(1) != node->output(0) || - node3->input(0) != node2->output(0) || node4->input(0) != node3->output(0) || - node5->input(0) != node4->output(0) || node6->input(0) != node5->output(0) || - node7->input(0) != node2->output(0) || node7->input(1) != node6->output(0)) - continue; + if (node_reference[node4->output(0)] != 1) continue; - if (weights.find(node3->input(1)) == weights.end()) continue; + if (node_reference[node5->output(0)] != 1) continue; - const onnx::TensorProto& pow_two = weights[node3->input(1)]; - if (pow_two.dims_size() != 0 || get_tensor_proto_data_size(pow_two) != 1) continue; + if (node_reference[node6->output(0)] != 1) continue; - float constant_pow_two = get_node_attr_from_input(pow_two); - if (constant_pow_two != 2.f) continue; + if (node2->input(0) != node->input(0) || node2->input(1) != node->output(0) || + node3->input(0) != node2->output(0) || node4->input(0) != 
node3->output(0) || + node5->input(0) != node4->output(0) || node6->input(0) != node5->output(0) || + node7->input(0) != node2->output(0) || node7->input(1) != node6->output(0)) + continue; - std::vector axes4 = get_node_attr_ai(*node4, "axes"); + if (weights.find(node3->input(1)) == weights.end()) continue; - // -1 - // -2 -1 - if ((int)axes4.size() != normed_axes) continue; + const onnx::TensorProto& pow_two = weights[node3->input(1)]; + if (pow_two.dims_size() != 0 || get_tensor_proto_data_size(pow_two) != 1) continue; - if (normed_axes == 1 && axes4[0] != -1) continue; - if (normed_axes == 2 && (axes4[0] != -2 || axes4[1] != -1)) continue; + float constant_pow_two = get_node_attr_from_input(pow_two); + if (constant_pow_two != 2.f) continue; - if (weights.find(node5->input(1)) == weights.end()) continue; + std::vector axes4 = get_node_attr_ai(*node4, "axes"); - const onnx::TensorProto& add_eps = weights[node5->input(1)]; - if (add_eps.dims_size() != 0 || get_tensor_proto_data_size(add_eps) != 1) continue; + // -1 + // -2 -1 + if ((int)axes4.size() != normed_axes) continue; - float eps = get_node_attr_from_input(add_eps); + if (normed_axes == 1 && axes4[0] != -1) continue; + if (normed_axes == 2 && (axes4[0] != -2 || axes4[1] != -1)) continue; - int affine = 0; - while (i + 8 < node_count) { - onnx::NodeProto* node8 = mutable_graph->mutable_node(i + 7); - onnx::NodeProto* node9 = mutable_graph->mutable_node(i + 8); + if (weights.find(node5->input(1)) == weights.end()) continue; - if (node8->op_type() != "Mul" || node9->op_type() != "Add") break; + const onnx::TensorProto& add_eps = weights[node5->input(1)]; + if (add_eps.dims_size() != 0 || get_tensor_proto_data_size(add_eps) != 1) continue; - if (node_reference[node7->output(0)] != 1) break; + float eps = get_node_attr_from_input(add_eps); - if (node_reference[node8->output(0)] != 1) break; + int affine = 0; + while (i + 8 < node_count) + { + onnx::NodeProto* node8 = mutable_graph->mutable_node(i + 7); + onnx::NodeProto* node9 = mutable_graph->mutable_node(i + 8); - if (node8->input(0) != node7->output(0) || node9->input(0) != node8->output(0)) break; + if (node8->op_type() != "Mul" || node9->op_type() != "Add") break; - // affine - std::vector affine_S = get_node_attr_from_input_af(weights[node8->input(1)]); - std::vector affine_B = get_node_attr_from_input_af(weights[node9->input(1)]); - if (affine_S.size() != affine_B.size()) break; + if (node_reference[node7->output(0)] != 1) break; - affine = 1; - break; - } + if (node_reference[node8->output(0)] != 1) break; - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); - node3->set_op_type("noop_reducedncnn"); - node4->set_op_type("noop_reducedncnn"); - node5->set_op_type("noop_reducedncnn"); - node6->set_op_type("noop_reducedncnn"); + if (node8->input(0) != node7->output(0) || node9->input(0) != node8->output(0)) break; - node_reference[node->input(0)] -= 1; - node_reference[node2->input(0)] -= 1; - node_reference[node2->input(1)] -= 1; - node_reference[node3->input(0)] -= 1; - node_reference[node3->input(1)] -= 1; - node_reference[node4->input(0)] -= 1; - node_reference[node5->input(0)] -= 1; - node_reference[node5->input(1)] -= 1; - node_reference[node6->input(0)] -= 1; - node_reference[node7->input(0)] -= 1; - node_reference[node7->input(1)] -= 1; + // affine + std::vector affine_S = get_node_attr_from_input_af(weights[node8->input(1)]); + std::vector affine_B = get_node_attr_from_input_af(weights[node9->input(1)]); + if (affine_S.size() != 
affine_B.size()) break; - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - blob_names.erase(node3->output(0)); - blob_names.erase(node4->output(0)); - blob_names.erase(node5->output(0)); - blob_names.erase(node6->output(0)); + affine = 1; + break; + } - node_reference[node->input(0)] += 1; + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + node3->set_op_type("noop_reducedncnn"); + node4->set_op_type("noop_reducedncnn"); + node5->set_op_type("noop_reducedncnn"); + node6->set_op_type("noop_reducedncnn"); - if (affine == 0) { - node7->set_op_type("LayerNorm"); - node7->clear_input(); - node7->add_input(node->input(0)); + node_reference[node->input(0)] -= 1; + node_reference[node2->input(0)] -= 1; + node_reference[node2->input(1)] -= 1; + node_reference[node3->input(0)] -= 1; + node_reference[node3->input(1)] -= 1; + node_reference[node4->input(0)] -= 1; + node_reference[node5->input(0)] -= 1; + node_reference[node5->input(1)] -= 1; + node_reference[node6->input(0)] -= 1; + node_reference[node7->input(0)] -= 1; + node_reference[node7->input(1)] -= 1; - onnx::AttributeProto* attr_eps = node7->add_attribute(); - attr_eps->set_name("epsilon"); - attr_eps->set_f(eps); + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + blob_names.erase(node3->output(0)); + blob_names.erase(node4->output(0)); + blob_names.erase(node5->output(0)); + blob_names.erase(node6->output(0)); - onnx::AttributeProto* attr_affine = node7->add_attribute(); - attr_affine->set_name("affine"); - attr_affine->set_i(affine); + node_reference[node->input(0)] += 1; - reduced_node_count += 6; - i += 6; - } else // if (affine == 1) - { - onnx::NodeProto* node8 = mutable_graph->mutable_node(i + 7); - onnx::NodeProto* node9 = mutable_graph->mutable_node(i + 8); + if (affine == 0) + { + node7->set_op_type("LayerNorm"); + node7->clear_input(); + node7->add_input(node->input(0)); - node7->set_op_type("noop_reducedncnn"); - node8->set_op_type("noop_reducedncnn"); + onnx::AttributeProto* attr_eps = node7->add_attribute(); + attr_eps->set_name("epsilon"); + attr_eps->set_f(eps); - node_reference[node8->input(0)] -= 1; - node_reference[node9->input(0)] -= 1; + onnx::AttributeProto* attr_affine = node7->add_attribute(); + attr_affine->set_name("affine"); + attr_affine->set_i(affine); - blob_names.erase(node7->output(0)); - blob_names.erase(node8->output(0)); + reduced_node_count += 6; + i += 6; + } + else // if (affine == 1) + { + onnx::NodeProto* node8 = mutable_graph->mutable_node(i + 7); + onnx::NodeProto* node9 = mutable_graph->mutable_node(i + 8); - std::string affine_scale = node8->input(1); - std::string affine_bias = node9->input(1); + node7->set_op_type("noop_reducedncnn"); + node8->set_op_type("noop_reducedncnn"); - node9->set_op_type("LayerNorm"); - node9->clear_input(); - node9->add_input(node->input(0)); - node9->add_input(affine_scale); - node9->add_input(affine_bias); - - onnx::AttributeProto* attr_eps = node9->add_attribute(); - attr_eps->set_name("epsilon"); - attr_eps->set_f(eps); - - onnx::AttributeProto* attr_affine = node9->add_attribute(); - attr_affine->set_name("affine"); - attr_affine->set_i(affine); - - reduced_node_count += 8; - i += 8; - } + node_reference[node8->input(0)] -= 1; + node_reference[node9->input(0)] -= 1; + + blob_names.erase(node7->output(0)); + blob_names.erase(node8->output(0)); + + std::string affine_scale = node8->input(1); + std::string affine_bias = node9->input(1); + + node9->set_op_type("LayerNorm"); 
+ node9->clear_input(); + node9->add_input(node->input(0)); + node9->add_input(affine_scale); + node9->add_input(affine_bias); + + onnx::AttributeProto* attr_eps = node9->add_attribute(); + attr_eps->set_name("epsilon"); + attr_eps->set_f(eps); + + onnx::AttributeProto* attr_affine = node9->add_attribute(); + attr_affine->set_name("affine"); + attr_affine->set_i(affine); + + reduced_node_count += 8; + i += 8; + } + } } - } } -void fuse_flatten(onnx::GraphProto* mutable_graph, +void fuse_flatten(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // Flatten <= X - Shape - Gather - Constant - Unsqueeze - Unsqueeze - Concat - // - Reshape - if (node->op_type() == "Shape") { - if (node_reference[node->output(0)] != 1) continue; - - if (i + 6 >= node_count) continue; - - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); - onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); - onnx::NodeProto* node6 = mutable_graph->mutable_node(i + 5); - onnx::NodeProto* node7 = mutable_graph->mutable_node(i + 6); - - if (node2->op_type() != "Gather" || node3->op_type() != "Constant" || - node4->op_type() != "Unsqueeze" || node5->op_type() != "Unsqueeze" || - node6->op_type() != "Concat" || node7->op_type() != "Reshape") - continue; - - if (node_reference[node2->output(0)] != 1) continue; - - // if (node_reference[node3->output(0)] != 1) - // continue; - - if (node_reference[node4->output(0)] != 1) continue; - - if (node_reference[node5->output(0)] != 1) continue; - - if (node_reference[node6->output(0)] != 1) continue; - - if (node2->input(0) != node->output(0) || node4->input(0) != node2->output(0) || - node5->input(0) != node3->output(0) || node6->input(0) != node4->output(0) || - node6->input(1) != node5->output(0) || node7->input(0) != node->input(0) || - node7->input(1) != node6->output(0)) - continue; - - // axis = 0 - int gather_axis = get_node_attr_i(*node2, "axis"); - if (gather_axis != 0) continue; - - // indices = 0 - if (weights.find(node2->input(1)) == weights.end()) continue; - - std::vector gather_indices = get_node_attr_from_input_ai(weights[node2->input(1)]); - if (gather_indices.size() != 1 || gather_indices[0] != 0) continue; - - // axes = (0) - std::vector unsqueeze_axes = get_node_attr_ai(*node4, "axes"); - if (unsqueeze_axes.size() != 1) continue; - if (unsqueeze_axes[0] != 0) continue; - - // axes = (0) - std::vector unsqueeze2_axes = get_node_attr_ai(*node5, "axes"); - if (unsqueeze2_axes.size() != 1) continue; - if (unsqueeze2_axes[0] != 0) continue; - - // data = -1 - if (weights.find(node5->input(0)) == weights.end()) continue; - - std::vector unsqueeze2_data = get_node_attr_from_input_ai(weights[node5->input(0)]); - if (unsqueeze2_data.size() != 1 || unsqueeze2_data[0] != -1) continue; - - // axis = 0 - int concat_axis = get_node_attr_i(*node6, "axis"); - if (concat_axis != 0) continue; - - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); - // node3->set_op_type("noop_reducedncnn"); - node4->set_op_type("noop_reducedncnn"); - node5->set_op_type("noop_reducedncnn"); - node6->set_op_type("noop_reducedncnn"); - - node_reference[node->input(0)] -= 1; - 
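// --- Illustrative sketch (toy values): the seven-node chain matched by
// fuse_layernorm above, ReduceMean - Sub - Pow(2) - ReduceMean - Add(eps) -
// Sqrt - Div, is exactly (x - mean) / sqrt(var + eps), i.e. LayerNorm without
// the optional Mul/Add affine tail:
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const float eps = 1e-5f;  // the scalar the pass reads from the Add weight
    std::vector<float> x = {1.f, 2.f, 3.f, 4.f};

    float mean = 0.f;
    for (float v : x) mean += v;
    mean /= x.size();  // ReduceMean(axes=[-1])

    float var = 0.f;
    for (float v : x) var += (v - mean) * (v - mean);  // Sub + Pow(2)
    var /= x.size();  // second ReduceMean

    float denom = std::sqrt(var + eps);  // Add(eps) + Sqrt
    for (float v : x) std::printf("%f ", (v - mean) / denom);  // final Div
    std::printf("\n");
    return 0;
}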
node_reference[node->output(0)] -= 1; - node_reference[node2->input(1)] -= 1; - node_reference[node2->output(0)] -= 1; - // node_reference[node3->output(0)] -= 1; - node_reference[node4->output(0)] -= 1; - node_reference[node5->input(0)] -= 1; - node_reference[node5->output(0)] -= 1; - node_reference[node6->output(0)] -= 1; - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - // blob_names.erase(node3->output(0)); - blob_names.erase(node4->output(0)); - blob_names.erase(node5->output(0)); - blob_names.erase(node6->output(0)); - - node7->set_op_type("Flatten"); - node7->clear_input(); - node7->add_input(node->input(0)); - - reduced_node_count += 5; - i += 5; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // Flatten <= X - Shape - Gather - Constant - Unsqueeze - Unsqueeze - Concat + // - Reshape + if (node->op_type() == "Shape") + { + if (node_reference[node->output(0)] != 1) continue; + + if (i + 6 >= node_count) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); + onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); + onnx::NodeProto* node6 = mutable_graph->mutable_node(i + 5); + onnx::NodeProto* node7 = mutable_graph->mutable_node(i + 6); + + if (node2->op_type() != "Gather" || node3->op_type() != "Constant" || + node4->op_type() != "Unsqueeze" || node5->op_type() != "Unsqueeze" || + node6->op_type() != "Concat" || node7->op_type() != "Reshape") + continue; + + if (node_reference[node2->output(0)] != 1) continue; + + // if (node_reference[node3->output(0)] != 1) + // continue; + + if (node_reference[node4->output(0)] != 1) continue; + + if (node_reference[node5->output(0)] != 1) continue; + + if (node_reference[node6->output(0)] != 1) continue; + + if (node2->input(0) != node->output(0) || node4->input(0) != node2->output(0) || + node5->input(0) != node3->output(0) || node6->input(0) != node4->output(0) || + node6->input(1) != node5->output(0) || node7->input(0) != node->input(0) || + node7->input(1) != node6->output(0)) + continue; + + // axis = 0 + int gather_axis = get_node_attr_i(*node2, "axis"); + if (gather_axis != 0) continue; + + // indices = 0 + if (weights.find(node2->input(1)) == weights.end()) continue; + + std::vector gather_indices = get_node_attr_from_input_ai(weights[node2->input(1)]); + if (gather_indices.size() != 1 || gather_indices[0] != 0) continue; + + // axes = (0) + std::vector unsqueeze_axes = get_node_attr_ai(*node4, "axes"); + if (unsqueeze_axes.size() != 1) continue; + if (unsqueeze_axes[0] != 0) continue; + + // axes = (0) + std::vector unsqueeze2_axes = get_node_attr_ai(*node5, "axes"); + if (unsqueeze2_axes.size() != 1) continue; + if (unsqueeze2_axes[0] != 0) continue; + + // data = -1 + if (weights.find(node5->input(0)) == weights.end()) continue; + + std::vector unsqueeze2_data = get_node_attr_from_input_ai(weights[node5->input(0)]); + if (unsqueeze2_data.size() != 1 || unsqueeze2_data[0] != -1) continue; + + // axis = 0 + int concat_axis = get_node_attr_i(*node6, "axis"); + if (concat_axis != 0) continue; + + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + // node3->set_op_type("noop_reducedncnn"); + node4->set_op_type("noop_reducedncnn"); + 
node5->set_op_type("noop_reducedncnn"); + node6->set_op_type("noop_reducedncnn"); + + node_reference[node->input(0)] -= 1; + node_reference[node->output(0)] -= 1; + node_reference[node2->input(1)] -= 1; + node_reference[node2->output(0)] -= 1; + // node_reference[node3->output(0)] -= 1; + node_reference[node4->output(0)] -= 1; + node_reference[node5->input(0)] -= 1; + node_reference[node5->output(0)] -= 1; + node_reference[node6->output(0)] -= 1; + + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + // blob_names.erase(node3->output(0)); + blob_names.erase(node4->output(0)); + blob_names.erase(node5->output(0)); + blob_names.erase(node6->output(0)); + + node7->set_op_type("Flatten"); + node7->clear_input(); + node7->add_input(node->input(0)); + + reduced_node_count += 5; + i += 5; + } } - } } -void fuse_pixelshuffle(onnx::GraphProto* mutable_graph, +void fuse_pixelshuffle(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // PixelShuffle <= Reshape - Transpose - Reshape - // PixelShuffle <= Reshape - Transpose - Constant - Reshape - if (node->op_type() == "Reshape") { - if (node_reference[node->output(0)] != 1) continue; + // PixelShuffle <= Reshape - Transpose - Reshape + // PixelShuffle <= Reshape - Transpose - Constant - Reshape + if (node->op_type() == "Reshape") + { + if (node_reference[node->output(0)] != 1) continue; - std::vector shape; - if (node->input_size() == 1) { - shape = get_node_attr_ai(*node, "shape"); - } else { - // skip weight reshape - if (weights.find(node->input(1)) == weights.end()) continue; + std::vector shape; + if (node->input_size() == 1) + { + shape = get_node_attr_ai(*node, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node->input(1)) == weights.end()) continue; - shape = get_node_attr_from_input_ai(weights[node->input(1)]); - } + shape = get_node_attr_from_input_ai(weights[node->input(1)]); + } - // -1, 3, upscale_factor, upscale_factor, height, width - if (shape.size() != 6) continue; + // -1, 3, upscale_factor, upscale_factor, height, width + if (shape.size() != 6) continue; - if (shape[0] != 1 && shape[0] != -1) continue; + if (shape[0] != 1 && shape[0] != -1) continue; - if (shape[2] != shape[3]) continue; + if (shape[2] != shape[3]) continue; - if (i + 2 >= node_count) continue; + if (i + 2 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - if (node3->op_type() == "Constant") { - if (i + 3 >= node_count) continue; + if (node3->op_type() == "Constant") + { + if (i + 3 >= node_count) continue; - node3 = mutable_graph->mutable_node(i + 3); - } + node3 = mutable_graph->mutable_node(i + 3); + } - if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue; + if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue; - if (node_reference[node2->output(0)] != 1) continue; + if 
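// --- Illustrative sketch (toy shape): the subgraph matched by fuse_flatten
// above only assembles the target shape {batch, -1} at runtime: Shape ->
// Gather(axis=0, indices=0) picks the batch dim, the Constant -1 is
// unsqueezed, and Concat feeds Reshape. No data moves at all:
#include <cstdio>
#include <vector>

int main()
{
    std::vector<long> dims = {2, 3, 4, 5};  // input shape, NCHW
    long batch = dims[0];                   // Gather(axis=0, indices=0)
    long rest = 1;                          // what the Constant -1 resolves to
    for (size_t d = 1; d < dims.size(); d++) rest *= dims[d];
    std::printf("Flatten: [%ld, %ld]\n", batch, rest);  // [2, 60]
    return 0;
}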
(node_reference[node2->output(0)] != 1) continue; - // 0 1 4 2 5 3 - std::vector perm = get_node_attr_ai(*node2, "perm"); - if (perm.size() != 6) continue; + // 0 1 4 2 5 3 + std::vector perm = get_node_attr_ai(*node2, "perm"); + if (perm.size() != 6) continue; - if (perm[0] != 0 || perm[1] != 1 || perm[2] != 4 || perm[3] != 2 || perm[4] != 5 || - perm[5] != 3) - continue; + if (perm[0] != 0 || perm[1] != 1 || perm[2] != 4 || perm[3] != 2 || perm[4] != 5 || + perm[5] != 3) + continue; - std::vector shape3; - if (node3->input_size() == 1) { - shape3 = get_node_attr_ai(*node3, "shape"); - } else { - // skip weight reshape - if (weights.find(node3->input(1)) == weights.end()) continue; + std::vector shape3; + if (node3->input_size() == 1) + { + shape3 = get_node_attr_ai(*node3, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node3->input(1)) == weights.end()) continue; - shape3 = get_node_attr_from_input_ai(weights[node3->input(1)]); - } + shape3 = get_node_attr_from_input_ai(weights[node3->input(1)]); + } - // -1, 3, height, width - if (shape3.size() != 4) continue; + // -1, 3, height, width + if (shape3.size() != 4) continue; - if (shape3[0] != 1 && shape3[0] != -1) continue; + if (shape3[0] != 1 && shape3[0] != -1) continue; - if (shape3[1] != shape[1] || shape3[2] != shape[2] * shape[4] || - shape3[3] != shape[3] * shape[5]) - continue; + if (shape3[1] != shape[1] || shape3[2] != shape[2] * shape[4] || + shape3[3] != shape[3] * shape[5]) + continue; - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); - if (node->input_size() == 2) { - node_reference[node->input(1)] -= 1; - } - node_reference[node->output(0)] -= 1; - node_reference[node2->output(0)] -= 1; - if (node3->input_size() == 2) { - node_reference[node3->input(1)] -= 1; - } + if (node->input_size() == 2) + { + node_reference[node->input(1)] -= 1; + } + node_reference[node->output(0)] -= 1; + node_reference[node2->output(0)] -= 1; + if (node3->input_size() == 2) + { + node_reference[node3->input(1)] -= 1; + } - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); - node3->set_op_type("PixelShuffle"); - node3->set_input(0, node->input(0)); + node3->set_op_type("PixelShuffle"); + node3->set_input(0, node->input(0)); - onnx::AttributeProto* attr_group = node3->add_attribute(); - attr_group->set_name("scale_factor"); - attr_group->set_i(shape[2]); + onnx::AttributeProto* attr_group = node3->add_attribute(); + attr_group->set_name("scale_factor"); + attr_group->set_i(shape[2]); - reduced_node_count += 2; - i += 2; + reduced_node_count += 2; + i += 2; + } } - } } -void fuse_reorg(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); +void fuse_reorg(onnx::GraphProto* mutable_graph, std::map& weights, std::map& node_reference, std::set& blob_names, int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // PixelShuffle <= Reshape - Transpose - Reshape - // PixelShuffle <= Reshape - Transpose - Constant - Reshape - if (node->op_type() == "Reshape") { - 
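// --- Illustrative sketch (single channel, upscale_factor 2): PixelShuffle is
// literally the matched Reshape(-1,c,r,r,h,w) -> Transpose(0,1,4,2,5,3) ->
// Reshape(-1,c,h*r,w*r). Tracing each output pixel through that permutation:
#include <cstdio>

int main()
{
    const int r = 2, h = 2, w = 2;
    // output (0, oy, ox) reads input channel (oy % r) * r + (ox % r)
    // at spatial position (oy / r, ox / r) -- exactly what the perm encodes
    for (int oy = 0; oy < h * r; oy++)
        for (int ox = 0; ox < w * r; ox++)
            std::printf("out(0,%d,%d) <- in(%d,%d,%d)\n",
                        oy, ox, (oy % r) * r + (ox % r), oy / r, ox / r);
    return 0;
}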
if (node_reference[node->output(0)] != 1) continue; + // PixelShuffle <= Reshape - Transpose - Reshape + // PixelShuffle <= Reshape - Transpose - Constant - Reshape + if (node->op_type() == "Reshape") + { + if (node_reference[node->output(0)] != 1) continue; - std::vector shape; - if (node->input_size() == 1) { - shape = get_node_attr_ai(*node, "shape"); - } else { - // skip weight reshape - if (weights.find(node->input(1)) == weights.end()) continue; + std::vector shape; + if (node->input_size() == 1) + { + shape = get_node_attr_ai(*node, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node->input(1)) == weights.end()) continue; - shape = get_node_attr_from_input_ai(weights[node->input(1)]); - } + shape = get_node_attr_from_input_ai(weights[node->input(1)]); + } - // -1, 3, out_height, block_size, out_width, block_size - if (shape.size() != 6) continue; + // -1, 3, out_height, block_size, out_width, block_size + if (shape.size() != 6) continue; - if (shape[0] != 1 && shape[0] != -1) continue; + if (shape[0] != 1 && shape[0] != -1) continue; - if (shape[3] != shape[5]) continue; + if (shape[3] != shape[5]) continue; - if (i + 2 >= node_count) continue; + if (i + 2 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - if (node3->op_type() == "Constant") { - if (i + 3 >= node_count) continue; + if (node3->op_type() == "Constant") + { + if (i + 3 >= node_count) continue; - node3 = mutable_graph->mutable_node(i + 3); - } + node3 = mutable_graph->mutable_node(i + 3); + } - if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue; + if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue; - if (node_reference[node2->output(0)] != 1) continue; + if (node_reference[node2->output(0)] != 1) continue; - // 0 1 3 5 2 4 - std::vector perm = get_node_attr_ai(*node2, "perm"); - if (perm.size() != 6) continue; + // 0 1 3 5 2 4 + std::vector perm = get_node_attr_ai(*node2, "perm"); + if (perm.size() != 6) continue; - if (perm[0] != 0 || perm[1] != 1 || perm[2] != 3 || perm[3] != 5 || perm[4] != 2 || - perm[5] != 4) - continue; + if (perm[0] != 0 || perm[1] != 1 || perm[2] != 3 || perm[3] != 5 || perm[4] != 2 || + perm[5] != 4) + continue; - std::vector shape3; - if (node3->input_size() == 1) { - shape3 = get_node_attr_ai(*node3, "shape"); - } else { - // skip weight reshape - if (weights.find(node3->input(1)) == weights.end()) continue; + std::vector shape3; + if (node3->input_size() == 1) + { + shape3 = get_node_attr_ai(*node3, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node3->input(1)) == weights.end()) continue; - shape3 = get_node_attr_from_input_ai(weights[node3->input(1)]); - } + shape3 = get_node_attr_from_input_ai(weights[node3->input(1)]); + } - // -1, out_channels, out_height, out_width - if (shape3.size() != 4) continue; + // -1, out_channels, out_height, out_width + if (shape3.size() != 4) continue; - if (shape3[0] != 1 && shape3[0] != -1) continue; + if (shape3[0] != 1 && shape3[0] != -1) continue; - if (shape3[1] != shape[1] * shape[3] * shape[5] || shape3[2] != shape[2] || - shape3[3] != shape[4]) - continue; + if (shape3[1] != shape[1] * shape[3] * shape[5] || shape3[2] != shape[2] || + shape3[3] != shape[4]) + continue; - // reduce - node->set_op_type("noop_reducedncnn"); - 
node2->set_op_type("noop_reducedncnn"); + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); - if (node->input_size() == 2) { - node_reference[node->input(1)] -= 1; - } - node_reference[node->output(0)] -= 1; - node_reference[node2->output(0)] -= 1; - if (node3->input_size() == 2) { - node_reference[node3->input(1)] -= 1; - } + if (node->input_size() == 2) + { + node_reference[node->input(1)] -= 1; + } + node_reference[node->output(0)] -= 1; + node_reference[node2->output(0)] -= 1; + if (node3->input_size() == 2) + { + node_reference[node3->input(1)] -= 1; + } - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); - node3->set_op_type("Reorg"); - node3->set_input(0, node->input(0)); + node3->set_op_type("Reorg"); + node3->set_input(0, node->input(0)); - onnx::AttributeProto* attr_group = node3->add_attribute(); - attr_group->set_name("stride"); - attr_group->set_i(shape[3]); + onnx::AttributeProto* attr_group = node3->add_attribute(); + attr_group->set_name("stride"); + attr_group->set_i(shape[3]); - reduced_node_count += 2; - i += 2; + reduced_node_count += 2; + i += 2; + } } - } } -void fuse_expand_broadcast(onnx::GraphProto* mutable_graph, +void fuse_expand_broadcast(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // Add/Sub/Mul/Div/Min/Max <= Expand - Add/Sub/Mul/Div/Min/Max - if (node->op_type() == "Expand") { - if (node_reference[node->output(0)] != 1) continue; + // Add/Sub/Mul/Div/Min/Max <= Expand - Add/Sub/Mul/Div/Min/Max + if (node->op_type() == "Expand") + { + if (node_reference[node->output(0)] != 1) continue; - if (i + 1 >= node_count) continue; + if (i + 1 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - if (node2->op_type() != "Add" && node2->op_type() != "Sub" && node2->op_type() != "Mul" && - node2->op_type() != "Div" && node2->op_type() != "Min" && node2->op_type() != "Max") - continue; + if (node2->op_type() != "Add" && node2->op_type() != "Sub" && node2->op_type() != "Mul" && + node2->op_type() != "Div" && node2->op_type() != "Min" && node2->op_type() != "Max") + continue; - if (node2->input(1) != node->output(0) && node2->input(0) != node->output(0)) continue; + if (node2->input(1) != node->output(0) && node2->input(0) != node->output(0)) continue; - // reduce - node->set_op_type("noop_reducedncnn"); + // reduce + node->set_op_type("noop_reducedncnn"); - node_reference[node->output(0)] -= 1; - if (node->input_size() == 2) { - node_reference[node->input(1)] -= 1; - } + node_reference[node->output(0)] -= 1; + if (node->input_size() == 2) + { + node_reference[node->input(1)] -= 1; + } - blob_names.erase(node->output(0)); + blob_names.erase(node->output(0)); - if (node2->input(0) == node->output(0)) { - node2->set_input(0, node->input(0)); - } else { - node2->set_input(1, node->input(0)); - } + if (node2->input(0) == node->output(0)) + { + node2->set_input(0, node->input(0)); + } + 
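// --- Illustrative sketch (single input channel, stride 2): Reorg
// (space-to-depth) is the matched Reshape(-1,c,oh,s,ow,s) ->
// Transpose(0,1,3,5,2,4) -> Reshape(-1,c*s*s,oh,ow). Mapping each output
// element back to the input:
#include <cstdio>

int main()
{
    const int s = 2, oh = 2, ow = 2;
    for (int oc = 0; oc < s * s; oc++)  // c * s * s output channels, c == 1
        for (int y = 0; y < oh; y++)
            for (int x = 0; x < ow; x++)
                std::printf("out(%d,%d,%d) <- in(0,%d,%d)\n",
                            oc, y, x, y * s + oc / s, x * s + oc % s);
    return 0;
}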
else + { + node2->set_input(1, node->input(0)); + } - reduced_node_count += 1; - i += 1; + reduced_node_count += 1; + i += 1; + } } - } } -void fuse_lstm_gru_rnn(onnx::GraphProto* mutable_graph, +void fuse_lstm_gru_rnn(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // LSTM(bi) <= LSTM(bi) - Transpose - Reshape - Transpose - // or LSTM(bi) <= LSTM(bi) - Transpose Constant - Reshape - Transpose - if (node->op_type() == "LSTM" || node->op_type() == "GRU" || node->op_type() == "RNN") { - if (node_reference[node->output(0)] != 1) continue; + // LSTM(bi) <= LSTM(bi) - Transpose - Reshape - Transpose + // or LSTM(bi) <= LSTM(bi) - Transpose Constant - Reshape - Transpose + if (node->op_type() == "LSTM" || node->op_type() == "GRU" || node->op_type() == "RNN") + { + if (node_reference[node->output(0)] != 1) continue; - if (i + 2 >= node_count) continue; + if (i + 2 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - // skip if second ops is constant - if (node3->op_type() == "Constant") { - if (i + 3 >= node_count) continue; - node3 = mutable_graph->mutable_node(i + 3); - i += 1; - } + // skip if second ops is constant + if (node3->op_type() == "Constant") + { + if (i + 3 >= node_count) continue; + node3 = mutable_graph->mutable_node(i + 3); + i += 1; + } - if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue; + if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue; - if (node_reference[node2->output(0)] != 1) continue; + if (node_reference[node2->output(0)] != 1) continue; - if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0)) continue; + if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0)) continue; - std::string direction = get_node_attr_s(*node, "direction"); - if (direction != "bidirectional") continue; + std::string direction = get_node_attr_s(*node, "direction"); + if (direction != "bidirectional") continue; - // 0 2 1 3 - std::vector perm = get_node_attr_ai(*node2, "perm"); - if (perm.size() != 4) continue; + // 0 2 1 3 + std::vector perm = get_node_attr_ai(*node2, "perm"); + if (perm.size() != 4) continue; - if (perm[0] != 0 || perm[1] != 2 || perm[2] != 1 || perm[3] != 3) continue; + if (perm[0] != 0 || perm[1] != 2 || perm[2] != 1 || perm[3] != 3) continue; - std::vector shape; - if (node3->input_size() == 1) { - shape = get_node_attr_ai(*node3, "shape"); - } else { - // skip weight reshape - if (weights.find(node3->input(1)) == weights.end()) continue; + std::vector shape; + if (node3->input_size() == 1) + { + shape = get_node_attr_ai(*node3, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node3->input(1)) == weights.end()) continue; - shape = get_node_attr_from_input_ai(weights[node3->input(1)]); - } + shape = get_node_attr_from_input_ai(weights[node3->input(1)]); + } - // 0 0 -1 - if 
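// --- Illustrative sketch (toy shapes): fuse_expand_broadcast above drops the
// Expand because ONNX binary ops broadcast numpy-style on their own, so
// pre-expanding one operand is a no-op. {3,1} + {1,4} gives the same {3,4}
// result whether or not either side is expanded first:
#include <cstdio>

int main()
{
    float a[3] = {1, 2, 3};         // logical shape {3, 1}
    float b[4] = {10, 20, 30, 40};  // logical shape {1, 4}
    for (int i = 0; i < 3; i++)
        for (int j = 0; j < 4; j++)
            std::printf("%g%c", a[i] + b[j], j == 3 ? '\n' : ' ');
    return 0;
}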
(shape.size() != 3) continue; + // 0 0 -1 + if (shape.size() != 3) continue; - if (shape[0] != 0 || shape[1] != 0 || shape[2] != -1) continue; + if (shape[0] != 0 || shape[1] != 0 || shape[2] != -1) continue; - // reduce - node2->set_op_type("noop_reducedncnn"); - node3->set_op_type("noop_reducedncnn"); + // reduce + node2->set_op_type("noop_reducedncnn"); + node3->set_op_type("noop_reducedncnn"); - node_reference[node->output(0)] -= 1; - node_reference[node2->output(0)] -= 1; - if (node3->input_size() == 2) { - node_reference[node3->input(1)] -= 1; - } + node_reference[node->output(0)] -= 1; + node_reference[node2->output(0)] -= 1; + if (node3->input_size() == 2) + { + node_reference[node3->input(1)] -= 1; + } - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); - node->set_output(0, node3->output(0)); + node->set_output(0, node3->output(0)); - reduced_node_count += 2; - i += 2; + reduced_node_count += 2; + i += 2; - if (i + 1 < node_count) { - if (node_reference[node3->output(0)] != 1) continue; + if (i + 1 < node_count) + { + if (node_reference[node3->output(0)] != 1) continue; - onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 1); - if (node4->op_type() != "Transpose") continue; + if (node4->op_type() != "Transpose") continue; - if (node4->input(0) != node->output(0)) continue; + if (node4->input(0) != node->output(0)) continue; - // 1 0 2 - std::vector perm4 = get_node_attr_ai(*node4, "perm"); - if (perm4.size() != 3) continue; + // 1 0 2 + std::vector perm4 = get_node_attr_ai(*node4, "perm"); + if (perm4.size() != 3) continue; - if (perm4[0] != 1 || perm4[1] != 0 || perm4[2] != 2) continue; + if (perm4[0] != 1 || perm4[1] != 0 || perm4[2] != 2) continue; - // reduce - node4->set_op_type("noop_reducedncnn"); + // reduce + node4->set_op_type("noop_reducedncnn"); - node_reference[node->output(0)] -= 1; + node_reference[node->output(0)] -= 1; - blob_names.erase(node->output(0)); + blob_names.erase(node->output(0)); - node->set_output(0, node4->output(0)); + node->set_output(0, node4->output(0)); - reduced_node_count += 1; - i += 1; - } + reduced_node_count += 1; + i += 1; + } + } } - } - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // LSTM(uni) <= LSTM(uni) - Squeeze - Transpose - if (node->op_type() == "LSTM" || node->op_type() == "GRU" || node->op_type() == "RNN") { - if (node_reference[node->output(0)] != 1) continue; + // LSTM(uni) <= LSTM(uni) - Squeeze - Transpose + if (node->op_type() == "LSTM" || node->op_type() == "GRU" || node->op_type() == "RNN") + { + if (node_reference[node->output(0)] != 1) continue; - if (i + 1 >= node_count) continue; + if (i + 1 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - if (node2->op_type() != "Squeeze") continue; + if (node2->op_type() != "Squeeze") continue; - if (node2->input(0) != node->output(0)) continue; + if (node2->input(0) != node->output(0)) continue; - std::string direction = get_node_attr_s(*node, "direction"); - if (direction == "bidirectional") continue; + std::string direction = get_node_attr_s(*node, "direction"); + if (direction == "bidirectional") continue; - // 1 - std::vector axes = get_node_attr_ai(*node2, 
"axes"); - if (axes.size() != 1) continue; + // 1 + std::vector axes = get_node_attr_ai(*node2, "axes"); + if (axes.size() != 1) continue; - if (axes[0] != 1) continue; + if (axes[0] != 1) continue; - // reduce - node2->set_op_type("noop_reducedncnn"); + // reduce + node2->set_op_type("noop_reducedncnn"); - node_reference[node->output(0)] -= 1; + node_reference[node->output(0)] -= 1; - blob_names.erase(node->output(0)); + blob_names.erase(node->output(0)); - node->set_output(0, node2->output(0)); + node->set_output(0, node2->output(0)); - reduced_node_count += 1; - i += 1; + reduced_node_count += 1; + i += 1; - if (i + 1 < node_count) { - if (node_reference[node2->output(0)] != 1) continue; + if (i + 1 < node_count) + { + if (node_reference[node2->output(0)] != 1) continue; - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 1); - if (node3->op_type() != "Transpose") continue; + if (node3->op_type() != "Transpose") continue; - if (node3->input(0) != node->output(0)) continue; + if (node3->input(0) != node->output(0)) continue; - // 1 0 2 - std::vector perm4 = get_node_attr_ai(*node3, "perm"); - if (perm4.size() != 3) continue; + // 1 0 2 + std::vector perm4 = get_node_attr_ai(*node3, "perm"); + if (perm4.size() != 3) continue; - if (perm4[0] != 1 || perm4[1] != 0 || perm4[2] != 2) continue; + if (perm4[0] != 1 || perm4[1] != 0 || perm4[2] != 2) continue; - // reduce - node3->set_op_type("noop_reducedncnn"); + // reduce + node3->set_op_type("noop_reducedncnn"); - node_reference[node->output(0)] -= 1; + node_reference[node->output(0)] -= 1; - blob_names.erase(node->output(0)); + blob_names.erase(node->output(0)); - node->set_output(0, node3->output(0)); + node->set_output(0, node3->output(0)); - reduced_node_count += 1; - i += 1; - } + reduced_node_count += 1; + i += 1; + } + } } - } - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // LSTM <= Transpose - LSTM - if (node->op_type() == "Transpose") { - if (node_reference[node->output(0)] != 1) continue; + // LSTM <= Transpose - LSTM + if (node->op_type() == "Transpose") + { + if (node_reference[node->output(0)] != 1) continue; - // 1 0 2 - std::vector perm = get_node_attr_ai(*node, "perm"); - if (perm.size() != 3) continue; + // 1 0 2 + std::vector perm = get_node_attr_ai(*node, "perm"); + if (perm.size() != 3) continue; - if (perm[0] != 1 || perm[1] != 0 || perm[2] != 2) continue; + if (perm[0] != 1 || perm[1] != 0 || perm[2] != 2) continue; - if (i + 1 >= node_count) continue; + if (i + 1 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - if (node2->op_type() != "LSTM" && node->op_type() != "GRU" && node->op_type() != "RNN") - continue; + if (node2->op_type() != "LSTM" && node->op_type() != "GRU" && node->op_type() != "RNN") + continue; - if (node2->input(0) != node->output(0)) continue; + if (node2->input(0) != node->output(0)) continue; - // reduce - node->set_op_type("noop_reducedncnn"); + // reduce + node->set_op_type("noop_reducedncnn"); - node_reference[node->output(0)] -= 1; + node_reference[node->output(0)] -= 1; - blob_names.erase(node->output(0)); + blob_names.erase(node->output(0)); - node2->set_input(0, node->input(0)); + node2->set_input(0, node->input(0)); - reduced_node_count += 1; - i += 1; + 
reduced_node_count += 1; + i += 1; + } } - } } -void fuse_multiheadattention(onnx::GraphProto* mutable_graph, +void fuse_multiheadattention(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // MultiHeadAttention <= MatMul(q) - Add - // - MatMul(k) - Add - // - MatMul(v) - Add - // - Mul - // - Reshape - Transpose - // - Reshape - Reshape - Transpose - Transpose - // - Gemm - Softmax - Gemm - Transpose - Reshape - - // MatMul - Add - if (node->op_type() == "MatMul") { - if (i + 19 >= node_count) continue; - - if (node_reference[node->output(0)] != 1) continue; - - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); - onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); - onnx::NodeProto* node6 = mutable_graph->mutable_node(i + 5); - onnx::NodeProto* node7 = mutable_graph->mutable_node(i + 6); - onnx::NodeProto* node8 = mutable_graph->mutable_node(i + 7); - onnx::NodeProto* node9 = mutable_graph->mutable_node(i + 8); - onnx::NodeProto* node10 = mutable_graph->mutable_node(i + 9); - onnx::NodeProto* node11 = mutable_graph->mutable_node(i + 10); - onnx::NodeProto* node12 = mutable_graph->mutable_node(i + 11); - onnx::NodeProto* node13 = mutable_graph->mutable_node(i + 12); - onnx::NodeProto* node14 = mutable_graph->mutable_node(i + 13); - onnx::NodeProto* node15 = mutable_graph->mutable_node(i + 14); - onnx::NodeProto* node16 = mutable_graph->mutable_node(i + 15); - onnx::NodeProto* node17 = mutable_graph->mutable_node(i + 16); - onnx::NodeProto* node18 = mutable_graph->mutable_node(i + 17); - onnx::NodeProto* node19 = mutable_graph->mutable_node(i + 18); - onnx::NodeProto* node20 = mutable_graph->mutable_node(i + 19); - - if (node2->op_type() != "Add" || node3->op_type() != "MatMul" || node4->op_type() != "Add" || - node5->op_type() != "MatMul" || node6->op_type() != "Add" || node7->op_type() != "Mul" || - node8->op_type() != "Reshape" || node9->op_type() != "Transpose" || - node10->op_type() != "Reshape" || node11->op_type() != "Reshape" || - node12->op_type() != "Transpose" || node13->op_type() != "Transpose" || - node14->op_type() != "MatMul" || node15->op_type() != "Softmax" || - node16->op_type() != "MatMul" || node17->op_type() != "Transpose" || - node18->op_type() != "Reshape" || node19->op_type() != "MatMul" || - node20->op_type() != "Add") - continue; - - if (node_reference[node2->output(0)] != 1 || node_reference[node3->output(0)] != 1 || - node_reference[node4->output(0)] != 1 || node_reference[node5->output(0)] != 1 || - node_reference[node6->output(0)] != 1 || node_reference[node7->output(0)] != 1 || - node_reference[node8->output(0)] != 1 || node_reference[node9->output(0)] != 1 || - node_reference[node10->output(0)] != 1 || node_reference[node11->output(0)] != 1 || - node_reference[node12->output(0)] != 1 || node_reference[node13->output(0)] != 1 || - node_reference[node14->output(0)] != 1 || node_reference[node15->output(0)] != 1 || - node_reference[node16->output(0)] != 1 || node_reference[node17->output(0)] != 1 || - node_reference[node18->output(0)] != 1 || node_reference[node19->output(0)] != 1) - continue; - - if (node2->input(0) != node->output(0) || node4->input(0) != node3->output(0) || - 
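// --- Illustrative sketch (toy sizes): ONNX emits bidirectional LSTM/GRU/RNN
// output as [T, directions, N, H]; the Transpose(0,2,1,3) + Reshape(0,0,-1)
// pair matched above only relayouts it to [T, N, directions*H], which the
// fused node can emit directly:
#include <cstdio>

int main()
{
    const int T = 2, D = 2, N = 1, H = 3;
    float src[T][D][N][H];
    for (int t = 0, v = 0; t < T; t++)
        for (int d = 0; d < D; d++)
            for (int n = 0; n < N; n++)
                for (int h = 0; h < H; h++) src[t][d][n][h] = (float)v++;

    // dst[t][n][d * H + h] = src[t][d][n][h]; print the relayouted rows
    for (int t = 0; t < T; t++)
        for (int n = 0; n < N; n++)
        {
            for (int d = 0; d < D; d++)
                for (int h = 0; h < H; h++) std::printf("%g ", src[t][d][n][h]);
            std::printf("\n");
        }
    return 0;
}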
node6->input(0) != node5->output(0) || node7->input(0) != node2->output(0) || - node8->input(0) != node7->output(0) || node9->input(0) != node8->output(0) || - node10->input(0) != node4->output(0) || node11->input(0) != node6->output(0) || - node12->input(0) != node11->output(0) || node13->input(0) != node10->output(0) || - node14->input(0) != node9->output(0) || node14->input(1) != node13->output(0) || - node15->input(0) != node14->output(0) || node16->input(0) != node15->output(0) || - node16->input(1) != node12->output(0) || node17->input(0) != node16->output(0) || - node18->input(0) != node17->output(0) || node19->input(0) != node18->output(0) || - node20->input(0) != node19->output(0)) - continue; - - std::vector q_B = get_node_attr_from_input_af(weights[node2->input(1)]); - std::vector k_B = get_node_attr_from_input_af(weights[node4->input(1)]); - std::vector v_B = get_node_attr_from_input_af(weights[node6->input(1)]); - std::vector o_B = get_node_attr_from_input_af(weights[node20->input(1)]); - - if (q_B.size() != k_B.size() || q_B.size() != v_B.size() || q_B.size() != o_B.size()) - continue; - - int embed_dim = q_B.size(); - - // 1 0 2 - std::vector perm9 = get_node_attr_ai(*node9, "perm"); - std::vector perm12 = get_node_attr_ai(*node12, "perm"); - if (perm9.size() != 3 || perm12.size() != 3) continue; - - if (perm9[0] != 1 || perm9[1] != 0 || perm9[2] != 2 || perm12[0] != 1 || perm12[1] != 0 || - perm12[2] != 2) - continue; - - // 1 2 0 - std::vector perm13 = get_node_attr_ai(*node13, "perm"); - if (perm13.size() != 3) continue; - - if (perm13[0] != 1 || perm13[1] != 2 || perm13[2] != 0) continue; - - // 1 0 2 - std::vector perm17 = get_node_attr_ai(*node17, "perm"); - if (perm17.size() != 3) continue; - - if (perm17[0] != 1 || perm17[1] != 0 || perm17[2] != 2) continue; - - int softmax_axis = get_node_attr_i(*node15, "axis"); - if (softmax_axis != 2) continue; - - // 1/-1, seqlen * num_heads, embed_dim / num_heads - std::vector shape8; - std::vector shape10; - std::vector shape11; - if (node8->input_size() == 1) { - shape8 = get_node_attr_ai(*node8, "shape"); - } else { - // skip weight reshape - if (weights.find(node8->input(1)) == weights.end()) continue; - - shape8 = get_node_attr_from_input_ai(weights[node8->input(1)]); - } - if (node10->input_size() == 1) { - shape10 = get_node_attr_ai(*node10, "shape"); - } else { - // skip weight reshape - if (weights.find(node10->input(1)) == weights.end()) continue; - - shape10 = get_node_attr_from_input_ai(weights[node10->input(1)]); - } - if (node11->input_size() == 1) { - shape11 = get_node_attr_ai(*node11, "shape"); - } else { - // skip weight reshape - if (weights.find(node11->input(1)) == weights.end()) continue; - - shape11 = get_node_attr_from_input_ai(weights[node11->input(1)]); - } - - if (shape8.size() != 3 || shape10.size() != 3 || shape11.size() != 3) continue; - - if (shape8[1] != shape10[1] || shape8[1] != shape11[1] || shape8[2] != shape10[2] || - shape8[2] != shape11[2]) - continue; - - int num_heads = embed_dim / shape8[2]; - - // 1, seqlen, embed_dim - std::vector shape18; - if (node18->input_size() == 1) { - shape18 = get_node_attr_ai(*node18, "shape"); - } else { - // skip weight reshape - if (weights.find(node18->input(1)) == weights.end()) continue; - - shape18 = get_node_attr_from_input_ai(weights[node18->input(1)]); - } - - if (shape18.size() != 3) continue; - - if (shape18[2] != embed_dim || shape18[1] * num_heads != shape8[1]) continue; - - // reduce - node->set_op_type("noop_reducedncnn"); - 
node2->set_op_type("noop_reducedncnn"); - node3->set_op_type("noop_reducedncnn"); - node4->set_op_type("noop_reducedncnn"); - node5->set_op_type("noop_reducedncnn"); - node6->set_op_type("noop_reducedncnn"); - node7->set_op_type("noop_reducedncnn"); - node8->set_op_type("noop_reducedncnn"); - node9->set_op_type("noop_reducedncnn"); - node10->set_op_type("noop_reducedncnn"); - node11->set_op_type("noop_reducedncnn"); - node12->set_op_type("noop_reducedncnn"); - node13->set_op_type("noop_reducedncnn"); - node14->set_op_type("noop_reducedncnn"); - node15->set_op_type("noop_reducedncnn"); - node16->set_op_type("noop_reducedncnn"); - node17->set_op_type("noop_reducedncnn"); - node18->set_op_type("noop_reducedncnn"); - node19->set_op_type("noop_reducedncnn"); - - node_reference[node2->input(0)] -= 1; - node_reference[node4->input(0)] -= 1; - node_reference[node6->input(0)] -= 1; - node_reference[node7->input(0)] -= 1; - node_reference[node7->input(1)] -= 1; - node_reference[node8->input(0)] -= 1; - if (node8->input_size() == 2) { - node_reference[node8->input(1)] -= 1; - } - node_reference[node9->input(0)] -= 1; - node_reference[node10->input(0)] -= 1; - if (node10->input_size() == 2) { - node_reference[node10->input(1)] -= 1; - } - node_reference[node11->input(0)] -= 1; - if (node11->input_size() == 2) { - node_reference[node11->input(1)] -= 1; - } - node_reference[node12->input(0)] -= 1; - node_reference[node13->input(0)] -= 1; - node_reference[node14->input(0)] -= 1; - node_reference[node14->input(1)] -= 1; - node_reference[node15->input(0)] -= 1; - node_reference[node16->input(0)] -= 1; - node_reference[node16->input(1)] -= 1; - node_reference[node17->input(0)] -= 1; - node_reference[node18->input(0)] -= 1; - if (node18->input_size() == 2) { - node_reference[node18->input(1)] -= 1; - } - node_reference[node19->input(0)] -= 1; - node_reference[node20->input(0)] -= 1; - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - blob_names.erase(node3->output(0)); - blob_names.erase(node4->output(0)); - blob_names.erase(node5->output(0)); - blob_names.erase(node6->output(0)); - blob_names.erase(node7->output(0)); - blob_names.erase(node8->output(0)); - blob_names.erase(node9->output(0)); - blob_names.erase(node10->output(0)); - blob_names.erase(node11->output(0)); - blob_names.erase(node12->output(0)); - blob_names.erase(node13->output(0)); - blob_names.erase(node14->output(0)); - blob_names.erase(node15->output(0)); - blob_names.erase(node16->output(0)); - blob_names.erase(node17->output(0)); - blob_names.erase(node18->output(0)); - blob_names.erase(node19->output(0)); - - std::string qw = node->input(1); - std::string qb = node2->input(1); - std::string kw = node3->input(1); - std::string kb = node4->input(1); - std::string vw = node5->input(1); - std::string vb = node6->input(1); - std::string ow = node19->input(1); - std::string ob = node20->input(1); - - node20->set_op_type("MultiHeadAttention"); - node20->clear_input(); - node20->add_input(node->input(0)); - node20->add_input(node3->input(0)); - node20->add_input(node5->input(0)); - // q - node20->add_input(qw); - node20->add_input(qb); - // k - node20->add_input(kw); - node20->add_input(kb); - // v - node20->add_input(vw); - node20->add_input(vb); - // out linear - node20->add_input(ow); - node20->add_input(ob); - - onnx::AttributeProto* attr_embed_dim = node20->add_attribute(); - attr_embed_dim->set_name("embed_dim"); - attr_embed_dim->set_i(embed_dim); - - onnx::AttributeProto* attr_num_heads = 
node20->add_attribute(); - attr_num_heads->set_name("num_heads"); - attr_num_heads->set_i(num_heads); - - reduced_node_count += 19; - i += 19; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // MultiHeadAttention <= MatMul(q) - Add + // - MatMul(k) - Add + // - MatMul(v) - Add + // - Mul + // - Reshape - Transpose + // - Reshape - Reshape - Transpose - Transpose + // - Gemm - Softmax - Gemm - Transpose - Reshape - + // MatMul - Add + if (node->op_type() == "MatMul") + { + if (i + 19 >= node_count) continue; + + if (node_reference[node->output(0)] != 1) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); + onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); + onnx::NodeProto* node6 = mutable_graph->mutable_node(i + 5); + onnx::NodeProto* node7 = mutable_graph->mutable_node(i + 6); + onnx::NodeProto* node8 = mutable_graph->mutable_node(i + 7); + onnx::NodeProto* node9 = mutable_graph->mutable_node(i + 8); + onnx::NodeProto* node10 = mutable_graph->mutable_node(i + 9); + onnx::NodeProto* node11 = mutable_graph->mutable_node(i + 10); + onnx::NodeProto* node12 = mutable_graph->mutable_node(i + 11); + onnx::NodeProto* node13 = mutable_graph->mutable_node(i + 12); + onnx::NodeProto* node14 = mutable_graph->mutable_node(i + 13); + onnx::NodeProto* node15 = mutable_graph->mutable_node(i + 14); + onnx::NodeProto* node16 = mutable_graph->mutable_node(i + 15); + onnx::NodeProto* node17 = mutable_graph->mutable_node(i + 16); + onnx::NodeProto* node18 = mutable_graph->mutable_node(i + 17); + onnx::NodeProto* node19 = mutable_graph->mutable_node(i + 18); + onnx::NodeProto* node20 = mutable_graph->mutable_node(i + 19); + + if (node2->op_type() != "Add" || node3->op_type() != "MatMul" || node4->op_type() != "Add" || + node5->op_type() != "MatMul" || node6->op_type() != "Add" || node7->op_type() != "Mul" || + node8->op_type() != "Reshape" || node9->op_type() != "Transpose" || + node10->op_type() != "Reshape" || node11->op_type() != "Reshape" || + node12->op_type() != "Transpose" || node13->op_type() != "Transpose" || + node14->op_type() != "MatMul" || node15->op_type() != "Softmax" || + node16->op_type() != "MatMul" || node17->op_type() != "Transpose" || + node18->op_type() != "Reshape" || node19->op_type() != "MatMul" || + node20->op_type() != "Add") + continue; + + if (node_reference[node2->output(0)] != 1 || node_reference[node3->output(0)] != 1 || + node_reference[node4->output(0)] != 1 || node_reference[node5->output(0)] != 1 || + node_reference[node6->output(0)] != 1 || node_reference[node7->output(0)] != 1 || + node_reference[node8->output(0)] != 1 || node_reference[node9->output(0)] != 1 || + node_reference[node10->output(0)] != 1 || node_reference[node11->output(0)] != 1 || + node_reference[node12->output(0)] != 1 || node_reference[node13->output(0)] != 1 || + node_reference[node14->output(0)] != 1 || node_reference[node15->output(0)] != 1 || + node_reference[node16->output(0)] != 1 || node_reference[node17->output(0)] != 1 || + node_reference[node18->output(0)] != 1 || node_reference[node19->output(0)] != 1) + continue; + + if (node2->input(0) != node->output(0) || node4->input(0) != node3->output(0) || + node6->input(0) != node5->output(0) || node7->input(0) 
!= node2->output(0) || + node8->input(0) != node7->output(0) || node9->input(0) != node8->output(0) || + node10->input(0) != node4->output(0) || node11->input(0) != node6->output(0) || + node12->input(0) != node11->output(0) || node13->input(0) != node10->output(0) || + node14->input(0) != node9->output(0) || node14->input(1) != node13->output(0) || + node15->input(0) != node14->output(0) || node16->input(0) != node15->output(0) || + node16->input(1) != node12->output(0) || node17->input(0) != node16->output(0) || + node18->input(0) != node17->output(0) || node19->input(0) != node18->output(0) || + node20->input(0) != node19->output(0)) + continue; + + std::vector q_B = get_node_attr_from_input_af(weights[node2->input(1)]); + std::vector k_B = get_node_attr_from_input_af(weights[node4->input(1)]); + std::vector v_B = get_node_attr_from_input_af(weights[node6->input(1)]); + std::vector o_B = get_node_attr_from_input_af(weights[node20->input(1)]); + + if (q_B.size() != k_B.size() || q_B.size() != v_B.size() || q_B.size() != o_B.size()) + continue; + + int embed_dim = q_B.size(); + + // 1 0 2 + std::vector perm9 = get_node_attr_ai(*node9, "perm"); + std::vector perm12 = get_node_attr_ai(*node12, "perm"); + if (perm9.size() != 3 || perm12.size() != 3) continue; + + if (perm9[0] != 1 || perm9[1] != 0 || perm9[2] != 2 || perm12[0] != 1 || perm12[1] != 0 || + perm12[2] != 2) + continue; + + // 1 2 0 + std::vector perm13 = get_node_attr_ai(*node13, "perm"); + if (perm13.size() != 3) continue; + + if (perm13[0] != 1 || perm13[1] != 2 || perm13[2] != 0) continue; + + // 1 0 2 + std::vector perm17 = get_node_attr_ai(*node17, "perm"); + if (perm17.size() != 3) continue; + + if (perm17[0] != 1 || perm17[1] != 0 || perm17[2] != 2) continue; + + int softmax_axis = get_node_attr_i(*node15, "axis"); + if (softmax_axis != 2) continue; + + // 1/-1, seqlen * num_heads, embed_dim / num_heads + std::vector shape8; + std::vector shape10; + std::vector shape11; + if (node8->input_size() == 1) + { + shape8 = get_node_attr_ai(*node8, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node8->input(1)) == weights.end()) continue; + + shape8 = get_node_attr_from_input_ai(weights[node8->input(1)]); + } + if (node10->input_size() == 1) + { + shape10 = get_node_attr_ai(*node10, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node10->input(1)) == weights.end()) continue; + + shape10 = get_node_attr_from_input_ai(weights[node10->input(1)]); + } + if (node11->input_size() == 1) + { + shape11 = get_node_attr_ai(*node11, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node11->input(1)) == weights.end()) continue; + + shape11 = get_node_attr_from_input_ai(weights[node11->input(1)]); + } + + if (shape8.size() != 3 || shape10.size() != 3 || shape11.size() != 3) continue; + + if (shape8[1] != shape10[1] || shape8[1] != shape11[1] || shape8[2] != shape10[2] || + shape8[2] != shape11[2]) + continue; + + int num_heads = embed_dim / shape8[2]; + + // 1, seqlen, embed_dim + std::vector shape18; + if (node18->input_size() == 1) + { + shape18 = get_node_attr_ai(*node18, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node18->input(1)) == weights.end()) continue; + + shape18 = get_node_attr_from_input_ai(weights[node18->input(1)]); + } + + if (shape18.size() != 3) continue; + + if (shape18[2] != embed_dim || shape18[1] * num_heads != shape8[1]) continue; + + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + 
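// Illustrative aside: fused nodes are not erased from the protobuf here; they
// are merely retyped to "noop_reducedncnn" and skipped later when the ncnn
// param file is emitted. A hedged sketch of a tally over such neutralized
// nodes (assumes the generated onnx.pb.h is in scope, as elsewhere in this
// file; the helper name is hypothetical):
static int count_reduced_nodes(const onnx::GraphProto& graph)
{
    int n = 0;
    for (int i = 0; i < graph.node_size(); i++)
    {
        if (graph.node(i).op_type() == "noop_reducedncnn") n++;
    }
    return n;
}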
node3->set_op_type("noop_reducedncnn"); + node4->set_op_type("noop_reducedncnn"); + node5->set_op_type("noop_reducedncnn"); + node6->set_op_type("noop_reducedncnn"); + node7->set_op_type("noop_reducedncnn"); + node8->set_op_type("noop_reducedncnn"); + node9->set_op_type("noop_reducedncnn"); + node10->set_op_type("noop_reducedncnn"); + node11->set_op_type("noop_reducedncnn"); + node12->set_op_type("noop_reducedncnn"); + node13->set_op_type("noop_reducedncnn"); + node14->set_op_type("noop_reducedncnn"); + node15->set_op_type("noop_reducedncnn"); + node16->set_op_type("noop_reducedncnn"); + node17->set_op_type("noop_reducedncnn"); + node18->set_op_type("noop_reducedncnn"); + node19->set_op_type("noop_reducedncnn"); + + node_reference[node2->input(0)] -= 1; + node_reference[node4->input(0)] -= 1; + node_reference[node6->input(0)] -= 1; + node_reference[node7->input(0)] -= 1; + node_reference[node7->input(1)] -= 1; + node_reference[node8->input(0)] -= 1; + if (node8->input_size() == 2) + { + node_reference[node8->input(1)] -= 1; + } + node_reference[node9->input(0)] -= 1; + node_reference[node10->input(0)] -= 1; + if (node10->input_size() == 2) + { + node_reference[node10->input(1)] -= 1; + } + node_reference[node11->input(0)] -= 1; + if (node11->input_size() == 2) + { + node_reference[node11->input(1)] -= 1; + } + node_reference[node12->input(0)] -= 1; + node_reference[node13->input(0)] -= 1; + node_reference[node14->input(0)] -= 1; + node_reference[node14->input(1)] -= 1; + node_reference[node15->input(0)] -= 1; + node_reference[node16->input(0)] -= 1; + node_reference[node16->input(1)] -= 1; + node_reference[node17->input(0)] -= 1; + node_reference[node18->input(0)] -= 1; + if (node18->input_size() == 2) + { + node_reference[node18->input(1)] -= 1; + } + node_reference[node19->input(0)] -= 1; + node_reference[node20->input(0)] -= 1; + + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + blob_names.erase(node3->output(0)); + blob_names.erase(node4->output(0)); + blob_names.erase(node5->output(0)); + blob_names.erase(node6->output(0)); + blob_names.erase(node7->output(0)); + blob_names.erase(node8->output(0)); + blob_names.erase(node9->output(0)); + blob_names.erase(node10->output(0)); + blob_names.erase(node11->output(0)); + blob_names.erase(node12->output(0)); + blob_names.erase(node13->output(0)); + blob_names.erase(node14->output(0)); + blob_names.erase(node15->output(0)); + blob_names.erase(node16->output(0)); + blob_names.erase(node17->output(0)); + blob_names.erase(node18->output(0)); + blob_names.erase(node19->output(0)); + + std::string qw = node->input(1); + std::string qb = node2->input(1); + std::string kw = node3->input(1); + std::string kb = node4->input(1); + std::string vw = node5->input(1); + std::string vb = node6->input(1); + std::string ow = node19->input(1); + std::string ob = node20->input(1); + + node20->set_op_type("MultiHeadAttention"); + node20->clear_input(); + node20->add_input(node->input(0)); + node20->add_input(node3->input(0)); + node20->add_input(node5->input(0)); + // q + node20->add_input(qw); + node20->add_input(qb); + // k + node20->add_input(kw); + node20->add_input(kb); + // v + node20->add_input(vw); + node20->add_input(vb); + // out linear + node20->add_input(ow); + node20->add_input(ob); + + onnx::AttributeProto* attr_embed_dim = node20->add_attribute(); + attr_embed_dim->set_name("embed_dim"); + attr_embed_dim->set_i(embed_dim); + + onnx::AttributeProto* attr_num_heads = node20->add_attribute(); + 
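// Illustrative aside: after this rewrite the fused node carries eleven
// inputs, while the packed-QKV variant emitted further below uses the
// five-input form {input, qkvw, qkvb, ow, ob}; later code in onnx2ncnn.cpp
// tells the two layouts apart by checking input_size() == 5. The role names
// below are descriptive only, not identifiers used by the converter:
static const char* const kMhaInputRoles[11] = {
    "q_in", "k_in", "v_in", "qw", "qb", "kw", "kb", "vw", "vb", "ow", "ob"};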
attr_num_heads->set_name("num_heads"); + attr_num_heads->set_i(num_heads); + + reduced_node_count += 19; + i += 19; + } } - } - - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // MultiHeadAttention <= MatMul(qkv) - Add - Split - // - Mul - // - Reshape - Transpose - // - Reshape - Reshape - Transpose - Transpose - // - Gemm - Softmax - Gemm - Transpose - Reshape - - // MatMul - Add - if (node->op_type() == "MatMul") { - if (i + 16 >= node_count) continue; - - if (node_reference[node->output(0)] != 1) continue; - - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); - onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); - onnx::NodeProto* node6 = mutable_graph->mutable_node(i + 5); - onnx::NodeProto* node7 = mutable_graph->mutable_node(i + 6); - onnx::NodeProto* node8 = mutable_graph->mutable_node(i + 7); - onnx::NodeProto* node9 = mutable_graph->mutable_node(i + 8); - onnx::NodeProto* node10 = mutable_graph->mutable_node(i + 9); - onnx::NodeProto* node11 = mutable_graph->mutable_node(i + 10); - onnx::NodeProto* node12 = mutable_graph->mutable_node(i + 11); - onnx::NodeProto* node13 = mutable_graph->mutable_node(i + 12); - onnx::NodeProto* node14 = mutable_graph->mutable_node(i + 13); - onnx::NodeProto* node15 = mutable_graph->mutable_node(i + 14); - onnx::NodeProto* node16 = mutable_graph->mutable_node(i + 15); - onnx::NodeProto* node17 = mutable_graph->mutable_node(i + 16); - - if (node2->op_type() != "Add" || node3->op_type() != "Split" || node4->op_type() != "Mul" || - node5->op_type() != "Reshape" || node6->op_type() != "Transpose" || - node7->op_type() != "Reshape" || node8->op_type() != "Reshape" || - node9->op_type() != "Transpose" || node10->op_type() != "Transpose" || - node11->op_type() != "MatMul" || node12->op_type() != "Softmax" || - node13->op_type() != "MatMul" || node14->op_type() != "Transpose" || - node15->op_type() != "Reshape" || node16->op_type() != "MatMul" || - node17->op_type() != "Add") - continue; - - if (node_reference[node2->output(0)] != 1 || node_reference[node3->output(0)] != 1 || - node_reference[node3->output(1)] != 1 || node_reference[node3->output(2)] != 1 || - node_reference[node4->output(0)] != 1 || node_reference[node5->output(0)] != 1 || - node_reference[node6->output(0)] != 1 || node_reference[node7->output(0)] != 1 || - node_reference[node8->output(0)] != 1 || node_reference[node9->output(0)] != 1 || - node_reference[node10->output(0)] != 1 || node_reference[node11->output(0)] != 1 || - node_reference[node12->output(0)] != 1 || node_reference[node13->output(0)] != 1 || - node_reference[node14->output(0)] != 1 || node_reference[node15->output(0)] != 1 || - node_reference[node16->output(0)] != 1) - continue; - - if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0) || - node4->input(0) != node3->output(0) || node5->input(0) != node4->output(0) || - node6->input(0) != node5->output(0) || node7->input(0) != node3->output(1) || - node8->input(0) != node3->output(2) || node9->input(0) != node8->output(0) || - node10->input(0) != node7->output(0) || node11->input(0) != node6->output(0) || - node11->input(1) != node10->output(0) || node12->input(0) != node11->output(0) || - node13->input(0) != node12->output(0) || node13->input(1) != node9->output(0) || - node14->input(0) != node13->output(0) || node15->input(0) != node14->output(0) || - 
node16->input(0) != node15->output(0) || node17->input(0) != node16->output(0)) - continue; - - std::vector qkv_B = get_node_attr_from_input_af(weights[node2->input(1)]); - std::vector o_B = get_node_attr_from_input_af(weights[node17->input(1)]); - - if (qkv_B.size() != o_B.size() * 3) continue; - - int embed_dim = o_B.size(); - - // 1 0 2 - std::vector perm6 = get_node_attr_ai(*node6, "perm"); - std::vector perm9 = get_node_attr_ai(*node9, "perm"); - if (perm6.size() != 3 || perm9.size() != 3) continue; - - if (perm6[0] != 1 || perm6[1] != 0 || perm6[2] != 2 || perm9[0] != 1 || perm9[1] != 0 || - perm9[2] != 2) - continue; - - // 1 2 0 - std::vector perm10 = get_node_attr_ai(*node10, "perm"); - if (perm10.size() != 3) continue; - - if (perm10[0] != 1 || perm10[1] != 2 || perm10[2] != 0) continue; - - // 1 0 2 - std::vector perm14 = get_node_attr_ai(*node14, "perm"); - if (perm14.size() != 3) continue; - - if (perm14[0] != 1 || perm14[1] != 0 || perm14[2] != 2) continue; - - int softmax_axis = get_node_attr_i(*node12, "axis"); - if (softmax_axis != 2) continue; - - // 1/-1, seqlen * num_heads, embed_dim / num_heads - std::vector shape5; - std::vector shape7; - std::vector shape8; - if (node5->input_size() == 1) { - shape5 = get_node_attr_ai(*node5, "shape"); - } else { - // skip weight reshape - if (weights.find(node5->input(1)) == weights.end()) continue; - - shape5 = get_node_attr_from_input_ai(weights[node5->input(1)]); - } - if (node7->input_size() == 1) { - shape7 = get_node_attr_ai(*node7, "shape"); - } else { - // skip weight reshape - if (weights.find(node7->input(1)) == weights.end()) continue; - - shape7 = get_node_attr_from_input_ai(weights[node7->input(1)]); - } - if (node8->input_size() == 1) { - shape8 = get_node_attr_ai(*node8, "shape"); - } else { - // skip weight reshape - if (weights.find(node8->input(1)) == weights.end()) continue; - - shape8 = get_node_attr_from_input_ai(weights[node8->input(1)]); - } - - if (shape5.size() != 3 || shape7.size() != 3 || shape8.size() != 3) continue; - - if (shape5[1] != shape7[1] || shape5[1] != shape8[1] || shape5[2] != shape7[2] || - shape5[2] != shape8[2]) - continue; - - int num_heads = embed_dim / shape5[2]; - - // 1, seqlen, embed_dim - std::vector shape15; - if (node15->input_size() == 1) { - shape15 = get_node_attr_ai(*node15, "shape"); - } else { - // skip weight reshape - if (weights.find(node15->input(1)) == weights.end()) continue; - - shape15 = get_node_attr_from_input_ai(weights[node15->input(1)]); - } - - if (shape15.size() != 3) continue; - - if (shape15[2] != embed_dim || shape15[1] * num_heads != shape8[1]) continue; - - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); - node3->set_op_type("noop_reducedncnn"); - node4->set_op_type("noop_reducedncnn"); - node5->set_op_type("noop_reducedncnn"); - node6->set_op_type("noop_reducedncnn"); - node7->set_op_type("noop_reducedncnn"); - node8->set_op_type("noop_reducedncnn"); - node9->set_op_type("noop_reducedncnn"); - node10->set_op_type("noop_reducedncnn"); - node11->set_op_type("noop_reducedncnn"); - node12->set_op_type("noop_reducedncnn"); - node13->set_op_type("noop_reducedncnn"); - node14->set_op_type("noop_reducedncnn"); - node15->set_op_type("noop_reducedncnn"); - node16->set_op_type("noop_reducedncnn"); - - node_reference[node2->input(0)] -= 1; - node_reference[node3->input(0)] -= 1; - node_reference[node4->input(0)] -= 1; - node_reference[node4->input(1)] -= 1; - node_reference[node5->input(0)] -= 1; - if (node5->input_size() 
== 2) { - node_reference[node5->input(1)] -= 1; - } - node_reference[node6->input(0)] -= 1; - node_reference[node7->input(0)] -= 1; - if (node7->input_size() == 2) { - node_reference[node7->input(1)] -= 1; - } - node_reference[node8->input(0)] -= 1; - if (node8->input_size() == 2) { - node_reference[node8->input(1)] -= 1; - } - node_reference[node9->input(0)] -= 1; - node_reference[node10->input(0)] -= 1; - node_reference[node11->input(0)] -= 1; - node_reference[node11->input(1)] -= 1; - node_reference[node12->input(0)] -= 1; - node_reference[node13->input(0)] -= 1; - node_reference[node13->input(1)] -= 1; - node_reference[node14->input(0)] -= 1; - node_reference[node15->input(0)] -= 1; - if (node15->input_size() == 2) { - node_reference[node15->input(1)] -= 1; - } - node_reference[node16->input(0)] -= 1; - node_reference[node17->input(0)] -= 1; - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - blob_names.erase(node3->output(0)); - blob_names.erase(node3->output(1)); - blob_names.erase(node3->output(2)); - blob_names.erase(node4->output(0)); - blob_names.erase(node5->output(0)); - blob_names.erase(node6->output(0)); - blob_names.erase(node7->output(0)); - blob_names.erase(node8->output(0)); - blob_names.erase(node9->output(0)); - blob_names.erase(node10->output(0)); - blob_names.erase(node11->output(0)); - blob_names.erase(node12->output(0)); - blob_names.erase(node13->output(0)); - blob_names.erase(node14->output(0)); - blob_names.erase(node15->output(0)); - blob_names.erase(node16->output(0)); - - std::string qkvw = node->input(1); - std::string qkvb = node2->input(1); - std::string ow = node16->input(1); - std::string ob = node17->input(1); - - node17->set_op_type("MultiHeadAttention"); - node17->clear_input(); - node17->add_input(node->input(0)); - // qkv - node17->add_input(qkvw); - node17->add_input(qkvb); - // out linear - node17->add_input(ow); - node17->add_input(ob); - - onnx::AttributeProto* attr_embed_dim = node17->add_attribute(); - attr_embed_dim->set_name("embed_dim"); - attr_embed_dim->set_i(embed_dim); - - onnx::AttributeProto* attr_num_heads = node17->add_attribute(); - attr_num_heads->set_name("num_heads"); - attr_num_heads->set_i(num_heads); - - reduced_node_count += 16; - i += 16; + + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // MultiHeadAttention <= MatMul(qkv) - Add - Split + // - Mul + // - Reshape - Transpose + // - Reshape - Reshape - Transpose - Transpose + // - Gemm - Softmax - Gemm - Transpose - Reshape - + // MatMul - Add + if (node->op_type() == "MatMul") + { + if (i + 16 >= node_count) continue; + + if (node_reference[node->output(0)] != 1) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); + onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); + onnx::NodeProto* node6 = mutable_graph->mutable_node(i + 5); + onnx::NodeProto* node7 = mutable_graph->mutable_node(i + 6); + onnx::NodeProto* node8 = mutable_graph->mutable_node(i + 7); + onnx::NodeProto* node9 = mutable_graph->mutable_node(i + 8); + onnx::NodeProto* node10 = mutable_graph->mutable_node(i + 9); + onnx::NodeProto* node11 = mutable_graph->mutable_node(i + 10); + onnx::NodeProto* node12 = mutable_graph->mutable_node(i + 11); + onnx::NodeProto* node13 = mutable_graph->mutable_node(i + 12); + onnx::NodeProto* node14 = mutable_graph->mutable_node(i + 13); + 
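// Illustrative aside: in this second pattern the Q, K and V projections share
// one MatMul, so the fused bias is the concatenation [q | k | v]; that is why
// the size check below requires qkv_B.size() == o_B.size() * 3. A sketch of
// unpacking that layout (struct and helper names are hypothetical):
#include <vector>

struct QkvBias
{
    std::vector<float> q, k, v;
};

static QkvBias split_qkv_bias(const std::vector<float>& qkv_bias)
{
    const size_t embed_dim = qkv_bias.size() / 3;
    QkvBias b;
    b.q.assign(qkv_bias.begin(), qkv_bias.begin() + embed_dim);
    b.k.assign(qkv_bias.begin() + embed_dim, qkv_bias.begin() + 2 * embed_dim);
    b.v.assign(qkv_bias.begin() + 2 * embed_dim, qkv_bias.end());
    return b;
}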
onnx::NodeProto* node15 = mutable_graph->mutable_node(i + 14); + onnx::NodeProto* node16 = mutable_graph->mutable_node(i + 15); + onnx::NodeProto* node17 = mutable_graph->mutable_node(i + 16); + + if (node2->op_type() != "Add" || node3->op_type() != "Split" || node4->op_type() != "Mul" || + node5->op_type() != "Reshape" || node6->op_type() != "Transpose" || + node7->op_type() != "Reshape" || node8->op_type() != "Reshape" || + node9->op_type() != "Transpose" || node10->op_type() != "Transpose" || + node11->op_type() != "MatMul" || node12->op_type() != "Softmax" || + node13->op_type() != "MatMul" || node14->op_type() != "Transpose" || + node15->op_type() != "Reshape" || node16->op_type() != "MatMul" || + node17->op_type() != "Add") + continue; + + if (node_reference[node2->output(0)] != 1 || node_reference[node3->output(0)] != 1 || + node_reference[node3->output(1)] != 1 || node_reference[node3->output(2)] != 1 || + node_reference[node4->output(0)] != 1 || node_reference[node5->output(0)] != 1 || + node_reference[node6->output(0)] != 1 || node_reference[node7->output(0)] != 1 || + node_reference[node8->output(0)] != 1 || node_reference[node9->output(0)] != 1 || + node_reference[node10->output(0)] != 1 || node_reference[node11->output(0)] != 1 || + node_reference[node12->output(0)] != 1 || node_reference[node13->output(0)] != 1 || + node_reference[node14->output(0)] != 1 || node_reference[node15->output(0)] != 1 || + node_reference[node16->output(0)] != 1) + continue; + + if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0) || + node4->input(0) != node3->output(0) || node5->input(0) != node4->output(0) || + node6->input(0) != node5->output(0) || node7->input(0) != node3->output(1) || + node8->input(0) != node3->output(2) || node9->input(0) != node8->output(0) || + node10->input(0) != node7->output(0) || node11->input(0) != node6->output(0) || + node11->input(1) != node10->output(0) || node12->input(0) != node11->output(0) || + node13->input(0) != node12->output(0) || node13->input(1) != node9->output(0) || + node14->input(0) != node13->output(0) || node15->input(0) != node14->output(0) || + node16->input(0) != node15->output(0) || node17->input(0) != node16->output(0)) + continue; + + std::vector qkv_B = get_node_attr_from_input_af(weights[node2->input(1)]); + std::vector o_B = get_node_attr_from_input_af(weights[node17->input(1)]); + + if (qkv_B.size() != o_B.size() * 3) continue; + + int embed_dim = o_B.size(); + + // 1 0 2 + std::vector perm6 = get_node_attr_ai(*node6, "perm"); + std::vector perm9 = get_node_attr_ai(*node9, "perm"); + if (perm6.size() != 3 || perm9.size() != 3) continue; + + if (perm6[0] != 1 || perm6[1] != 0 || perm6[2] != 2 || perm9[0] != 1 || perm9[1] != 0 || + perm9[2] != 2) + continue; + + // 1 2 0 + std::vector perm10 = get_node_attr_ai(*node10, "perm"); + if (perm10.size() != 3) continue; + + if (perm10[0] != 1 || perm10[1] != 2 || perm10[2] != 0) continue; + + // 1 0 2 + std::vector perm14 = get_node_attr_ai(*node14, "perm"); + if (perm14.size() != 3) continue; + + if (perm14[0] != 1 || perm14[1] != 0 || perm14[2] != 2) continue; + + int softmax_axis = get_node_attr_i(*node12, "axis"); + if (softmax_axis != 2) continue; + + // 1/-1, seqlen * num_heads, embed_dim / num_heads + std::vector shape5; + std::vector shape7; + std::vector shape8; + if (node5->input_size() == 1) + { + shape5 = get_node_attr_ai(*node5, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node5->input(1)) == weights.end()) continue; + + shape5 = 
get_node_attr_from_input_ai(weights[node5->input(1)]); + } + if (node7->input_size() == 1) + { + shape7 = get_node_attr_ai(*node7, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node7->input(1)) == weights.end()) continue; + + shape7 = get_node_attr_from_input_ai(weights[node7->input(1)]); + } + if (node8->input_size() == 1) + { + shape8 = get_node_attr_ai(*node8, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node8->input(1)) == weights.end()) continue; + + shape8 = get_node_attr_from_input_ai(weights[node8->input(1)]); + } + + if (shape5.size() != 3 || shape7.size() != 3 || shape8.size() != 3) continue; + + if (shape5[1] != shape7[1] || shape5[1] != shape8[1] || shape5[2] != shape7[2] || + shape5[2] != shape8[2]) + continue; + + int num_heads = embed_dim / shape5[2]; + + // 1, seqlen, embed_dim + std::vector shape15; + if (node15->input_size() == 1) + { + shape15 = get_node_attr_ai(*node15, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node15->input(1)) == weights.end()) continue; + + shape15 = get_node_attr_from_input_ai(weights[node15->input(1)]); + } + + if (shape15.size() != 3) continue; + + if (shape15[2] != embed_dim || shape15[1] * num_heads != shape8[1]) continue; + + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + node3->set_op_type("noop_reducedncnn"); + node4->set_op_type("noop_reducedncnn"); + node5->set_op_type("noop_reducedncnn"); + node6->set_op_type("noop_reducedncnn"); + node7->set_op_type("noop_reducedncnn"); + node8->set_op_type("noop_reducedncnn"); + node9->set_op_type("noop_reducedncnn"); + node10->set_op_type("noop_reducedncnn"); + node11->set_op_type("noop_reducedncnn"); + node12->set_op_type("noop_reducedncnn"); + node13->set_op_type("noop_reducedncnn"); + node14->set_op_type("noop_reducedncnn"); + node15->set_op_type("noop_reducedncnn"); + node16->set_op_type("noop_reducedncnn"); + + node_reference[node2->input(0)] -= 1; + node_reference[node3->input(0)] -= 1; + node_reference[node4->input(0)] -= 1; + node_reference[node4->input(1)] -= 1; + node_reference[node5->input(0)] -= 1; + if (node5->input_size() == 2) + { + node_reference[node5->input(1)] -= 1; + } + node_reference[node6->input(0)] -= 1; + node_reference[node7->input(0)] -= 1; + if (node7->input_size() == 2) + { + node_reference[node7->input(1)] -= 1; + } + node_reference[node8->input(0)] -= 1; + if (node8->input_size() == 2) + { + node_reference[node8->input(1)] -= 1; + } + node_reference[node9->input(0)] -= 1; + node_reference[node10->input(0)] -= 1; + node_reference[node11->input(0)] -= 1; + node_reference[node11->input(1)] -= 1; + node_reference[node12->input(0)] -= 1; + node_reference[node13->input(0)] -= 1; + node_reference[node13->input(1)] -= 1; + node_reference[node14->input(0)] -= 1; + node_reference[node15->input(0)] -= 1; + if (node15->input_size() == 2) + { + node_reference[node15->input(1)] -= 1; + } + node_reference[node16->input(0)] -= 1; + node_reference[node17->input(0)] -= 1; + + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + blob_names.erase(node3->output(0)); + blob_names.erase(node3->output(1)); + blob_names.erase(node3->output(2)); + blob_names.erase(node4->output(0)); + blob_names.erase(node5->output(0)); + blob_names.erase(node6->output(0)); + blob_names.erase(node7->output(0)); + blob_names.erase(node8->output(0)); + blob_names.erase(node9->output(0)); + blob_names.erase(node10->output(0)); + blob_names.erase(node11->output(0)); + 
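// Illustrative aside: every fusion keeps the bookkeeping consistent: each
// consumed input edge gives up one reference, and each blob that became
// internal to the fused op leaves blob_names, otherwise the layer/blob counts
// written to the ncnn param header later would be wrong. A simplified sketch
// of that contract for one node (the real pass decrements optional second
// inputs only when they exist, as the input_size() == 2 checks here show):
#include <map>
#include <set>
#include <string>

static void release_node_edges(const onnx::NodeProto& node,
                               std::map<std::string, int>& node_reference,
                               std::set<std::string>& blob_names)
{
    for (int j = 0; j < node.input_size(); j++) node_reference[node.input(j)] -= 1;
    for (int j = 0; j < node.output_size(); j++) blob_names.erase(node.output(j));
}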
blob_names.erase(node12->output(0));
+            blob_names.erase(node13->output(0));
+            blob_names.erase(node14->output(0));
+            blob_names.erase(node15->output(0));
+            blob_names.erase(node16->output(0));
+
+            std::string qkvw = node->input(1);
+            std::string qkvb = node2->input(1);
+            std::string ow = node16->input(1);
+            std::string ob = node17->input(1);
+
+            node17->set_op_type("MultiHeadAttention");
+            node17->clear_input();
+            node17->add_input(node->input(0));
+            // qkv
+            node17->add_input(qkvw);
+            node17->add_input(qkvb);
+            // out linear
+            node17->add_input(ow);
+            node17->add_input(ob);
+
+            onnx::AttributeProto* attr_embed_dim = node17->add_attribute();
+            attr_embed_dim->set_name("embed_dim");
+            attr_embed_dim->set_i(embed_dim);
+
+            onnx::AttributeProto* attr_num_heads = node17->add_attribute();
+            attr_num_heads->set_name("num_heads");
+            attr_num_heads->set_i(num_heads);
+
+            reduced_node_count += 16;
+            i += 16;
+        }
+    }
-    }
-  }
 }
diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.h b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.h
index 31dc6f5b93..73390cc24d 100644
--- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.h
+++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.h
@@ -4,30 +4,35 @@
 #include "shape_inference.h"
 #include "utils.h"

-void fuse_identity(onnx::GraphProto* mutable_graph,
+void fuse_identity(onnx::GraphProto* mutable_graph,
                    std::map<std::string, onnx::TensorProto>& weights,
-                   std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                   int& reduced_node_count);
+                   std::map<std::string, int>& node_reference,
+                   std::set<std::string>& blob_names,
+                   int& reduced_node_count);

-void fuse_rewrite_gather(onnx::GraphProto* mutable_graph,
+void fuse_rewrite_gather(onnx::GraphProto* mutable_graph,
                          std::map<std::string, onnx::TensorProto>& weights,
-                         std::map<std::string, int>& node_reference,
-                         std::set<std::string>& blob_names, int& reduced_node_count);
+                         std::map<std::string, int>& node_reference,
+                         std::set<std::string>& blob_names,
+                         int& reduced_node_count);

-void fuse_weight_reshape(onnx::GraphProto* mutable_graph,
+void fuse_weight_reshape(onnx::GraphProto* mutable_graph,
                          std::map<std::string, onnx::TensorProto>& weights,
-                         std::map<std::string, int>& node_reference,
-                         std::set<std::string>& blob_names, int& reduced_node_count);
+                         std::map<std::string, int>& node_reference,
+                         std::set<std::string>& blob_names,
+                         int& reduced_node_count);

-void fuse_shufflechannel(onnx::GraphProto* mutable_graph,
+void fuse_shufflechannel(onnx::GraphProto* mutable_graph,
                          std::map<std::string, onnx::TensorProto>& weights,
-                         std::map<std::string, int>& node_reference,
-                         std::set<std::string>& blob_names, int& reduced_node_count);
+                         std::map<std::string, int>& node_reference,
+                         std::set<std::string>& blob_names,
+                         int& reduced_node_count);

-void fuse_shufflechannel_split(onnx::GraphProto* mutable_graph,
+void fuse_shufflechannel_split(onnx::GraphProto* mutable_graph,
                                std::map<std::string, onnx::TensorProto>& weights,
-                               std::map<std::string, int>& node_reference,
-                               std::set<std::string>& blob_names, int& reduced_node_count);
+                               std::map<std::string, int>& node_reference,
+                               std::set<std::string>& blob_names,
+                               int& reduced_node_count);

@@ -46,85 +51,96 @@ void fuse_shufflechannel_split(onnx::GraphProto* mutable_graph,
 /**
  * @brief fuse subgraph
  *
  * @param mutable_graph
  * @param weights
  * @param node_reference
  * @param blob_names
  * @param reduced_node_count
  */
-void fuse_conv_reshape(onnx::GraphProto* mutable_graph,
+void fuse_conv_reshape(onnx::GraphProto* mutable_graph,
                        std::map<std::string, onnx::TensorProto>& weights,
-                       std::map<std::string, int>& node_reference,
-                       std::set<std::string>& blob_names, int& reduced_node_count);
+                       std::map<std::string, int>& node_reference,
+                       std::set<std::string>& blob_names,
+                       int& reduced_node_count);

-void fuse_binaryop_with_scalar(onnx::GraphProto* mutable_graph,
+void fuse_binaryop_with_scalar(onnx::GraphProto* mutable_graph,
                                std::map<std::string, onnx::TensorProto>& weights,
-                               std::map<std::string, int>& node_reference,
-                               std::set<std::string>& blob_names, int& reduced_node_count);
+                               std::map<std::string, int>& node_reference,
+                               std::set<std::string>& blob_names,
+                               int& reduced_node_count);

-void fuse_hardswish(onnx::GraphProto* mutable_graph,
+void fuse_hardswish(onnx::GraphProto* mutable_graph,
                     std::map<std::string, onnx::TensorProto>& weights,
-                    std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                    int& reduced_node_count);
+                    std::map<std::string, int>& node_reference,
+                    std::set<std::string>& blob_names,
+                    int& reduced_node_count);

-void fuse_hardsigmoid(onnx::GraphProto* mutable_graph,
+void fuse_hardsigmoid(onnx::GraphProto* mutable_graph,
                       std::map<std::string, onnx::TensorProto>& weights,
-                      std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                      int& reduced_node_count);
+                      std::map<std::string, int>& node_reference,
+                      std::set<std::string>& blob_names,
+                      int& reduced_node_count);

-void fuse_batchnorm1d_squeeze_unsqueeze(onnx::GraphProto* mutable_graph,
+void fuse_batchnorm1d_squeeze_unsqueeze(onnx::GraphProto* mutable_graph,
                                         std::map<std::string, onnx::TensorProto>& weights,
-                                        std::map<std::string, int>& node_reference,
-                                        std::set<std::string>& blob_names, int& reduced_node_count);
+                                        std::map<std::string, int>& node_reference,
+                                        std::set<std::string>& blob_names,
+                                        int& reduced_node_count);

-void fuse_unsqueeze_prelu(onnx::GraphProto* mutable_graph,
+void fuse_unsqueeze_prelu(onnx::GraphProto* mutable_graph,
                           std::map<std::string, onnx::TensorProto>& weights,
-                          std::map<std::string, int>& node_reference,
-                          std::set<std::string>& blob_names, int& reduced_node_count);
+                          std::map<std::string, int>& node_reference,
+                          std::set<std::string>& blob_names,
+                          int& reduced_node_count);

-void fuse_normalize(onnx::GraphProto* mutable_graph,
+void fuse_normalize(onnx::GraphProto* mutable_graph,
                     std::map<std::string, onnx::TensorProto>& weights,
-                    std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                    int& reduced_node_count);
+                    std::map<std::string, int>& node_reference,
+                    std::set<std::string>& blob_names,
+                    int& reduced_node_count);

-void fuse_groupnorm(onnx::GraphProto* mutable_graph,
+void fuse_groupnorm(onnx::GraphProto* mutable_graph,
                     std::map<std::string, onnx::TensorProto>& weights,
-                    std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                    int& reduced_node_count);
+                    std::map<std::string, int>& node_reference,
+                    std::set<std::string>& blob_names,
+                    int& reduced_node_count);

-void fuse_layernorm(onnx::GraphProto* mutable_graph,
+void fuse_layernorm(onnx::GraphProto* mutable_graph,
                     std::map<std::string, onnx::TensorProto>& weights,
-                    std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                    int& reduced_node_count);
+                    std::map<std::string, int>& node_reference,
+                    std::set<std::string>& blob_names,
+                    int& reduced_node_count);

-void fuse_flatten(onnx::GraphProto* mutable_graph,
+void fuse_flatten(onnx::GraphProto* mutable_graph,
                   std::map<std::string, onnx::TensorProto>& weights,
-                  std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                  int& reduced_node_count);
+                  std::map<std::string, int>& node_reference,
+                  std::set<std::string>& blob_names,
+                  int& reduced_node_count);

-void fuse_pixelshuffle(onnx::GraphProto* mutable_graph,
+void fuse_pixelshuffle(onnx::GraphProto* mutable_graph,
                        std::map<std::string, onnx::TensorProto>& weights,
-                       std::map<std::string, int>& node_reference,
-                       std::set<std::string>& blob_names, int& reduced_node_count);
+                       std::map<std::string, int>& node_reference,
+                       std::set<std::string>& blob_names,
+                       int& reduced_node_count);

-void fuse_reorg(onnx::GraphProto* mutable_graph, std::map<std::string, onnx::TensorProto>& weights,
-                std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                int& reduced_node_count);
+void fuse_reorg(onnx::GraphProto* mutable_graph, std::map<std::string, onnx::TensorProto>& weights, std::map<std::string, int>& node_reference, std::set<std::string>& blob_names, int& reduced_node_count);

-void fuse_expand_broadcast(onnx::GraphProto* mutable_graph,
+void fuse_expand_broadcast(onnx::GraphProto* mutable_graph,
                            std::map<std::string, onnx::TensorProto>& weights,
-                           std::map<std::string, int>& node_reference,
-                           std::set<std::string>& blob_names, int& reduced_node_count);
+                           std::map<std::string, int>& node_reference,
+                           std::set<std::string>& blob_names,
+                           int& reduced_node_count);

-void fuse_lstm_gru_rnn(onnx::GraphProto* mutable_graph,
+void fuse_lstm_gru_rnn(onnx::GraphProto* mutable_graph,
                        std::map<std::string, onnx::TensorProto>& weights,
-                       std::map<std::string, int>& node_reference,
-                       std::set<std::string>& blob_names, int& reduced_node_count);
+                       std::map<std::string, int>& node_reference,
+                       std::set<std::string>& blob_names,
+                       int& reduced_node_count);

-void fuse_multiheadattention(onnx::GraphProto* mutable_graph,
+void fuse_multiheadattention(onnx::GraphProto* mutable_graph,
                              std::map<std::string, onnx::TensorProto>& weights,
-                             std::map<std::string, int>& node_reference,
-                             std::set<std::string>& blob_names, int& reduced_node_count);
+                             std::map<std::string, int>& node_reference,
+                             std::set<std::string>& blob_names,
+                             int& reduced_node_count);

-void fuse_weight_transpose(onnx::GraphProto* mutable_graph,
+void fuse_weight_transpose(onnx::GraphProto* mutable_graph,
                            std::map<std::string, onnx::TensorProto>& weights,
-                           std::map<std::string, int>& node_reference,
-                           std::set<std::string>& blob_names, int& reduced_node_count);
+                           std::map<std::string, int>& node_reference,
+                           std::set<std::string>& blob_names,
+                           int& reduced_node_count);

-void fuse_swish(onnx::GraphProto* mutable_graph, std::map<std::string, onnx::TensorProto>& weights,
-                std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                int& reduced_node_count);
+void fuse_swish(onnx::GraphProto* mutable_graph, std::map<std::string, onnx::TensorProto>& weights, std::map<std::string, int>& node_reference, std::set<std::string>& blob_names, int& reduced_node_count);
diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/onnx2ncnn.cpp b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/onnx2ncnn.cpp
index ca8cd628ad..bc38599b63 100644
--- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/onnx2ncnn.cpp
+++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/onnx2ncnn.cpp
@@ -26,2719 +26,3551 @@
 #include "shape_inference.h"
 #include "utils.h"

-int main(int argc, char** argv) {
-  if (!(argc == 2 || argc == 4)) {
-    fprintf(stderr, "Usage: %s [onnxpb] [ncnnparam] [ncnnbin]\n", argv[0]);
-    return -1;
-  }
-
-  const char* onnxpb = argv[1];
-  const char* ncnn_prototxt = argc == 4 ? argv[2] : "ncnn.param";
-  const char* ncnn_modelbin = argc == 4 ? argv[3] : "ncnn.bin";
-
-  onnx::ModelProto model;
-
-  // load
-  bool s1 = read_proto_from_binary(onnxpb, &model);
-  if (!s1) {
-    fprintf(stderr, "read_proto_from_binary failed\n");
-    return -1;
-  }
-  FILE* pp = fopen(ncnn_prototxt, "wb");
-  FILE* bp = fopen(ncnn_modelbin, "wb");
-  // magic
-  fprintf(pp, "7767517\n");
-  onnx::GraphProto* mutable_graph = model.mutable_graph();
-  int node_count = mutable_graph->node_size();
-
-  // node reference
-  std::map<std::string, int> node_reference;
-
-  // weight node and weight reshape node
-  std::map<std::string, onnx::TensorProto> weights;
-  for (int j = 0; j < mutable_graph->initializer_size(); j++) {
-    const onnx::TensorProto& initializer = mutable_graph->initializer(j);
-
-    // fprintf(stderr, "weight = %s %d\n", initializer.name().c_str(),
-    // initializer.data_type());
-
-    weights[initializer.name()] = initializer;
-  }
-  // topological sort
-  {
-    // name -> producer node index
-    std::set<std::string> producers;
-    for (int j = 0; j < mutable_graph->input_size(); j++) {
-      const std::string& input_name = mutable_graph->input(j).name();
-      producers.insert(input_name);
+int main(int argc, char** argv)
+{
+    if (!(argc == 2 || argc == 4))
+    {
+        fprintf(stderr, "Usage: %s [onnxpb] [ncnnparam] [ncnnbin]\n", argv[0]);
+        return -1;
     }

-    for (int i = 0; i < node_count;) {
-      onnx::NodeProto* node = mutable_graph->mutable_node(i);

+    const char* onnxpb = argv[1];
+    const char* ncnn_prototxt = argc == 4 ? argv[2] : "ncnn.param";
+    const char* ncnn_modelbin = argc == 4 ?
argv[3] : "ncnn.bin"; - bool swapnode = false; - std::string missing_input_name; - for (int j = 0; j < (int)node->input_size(); j++) { - const std::string& input_name = node->input(j); - if (input_name.empty()) continue; + onnx::ModelProto model; - if (producers.find(input_name) == producers.end() && - weights.find(input_name) == weights.end()) { - swapnode = true; - missing_input_name = input_name; - break; - } - } + // load + bool s1 = read_proto_from_binary(onnxpb, &model); + if (!s1) + { + fprintf(stderr, "read_proto_from_binary failed\n"); + return -1; + } + FILE* pp = fopen(ncnn_prototxt, "wb"); + FILE* bp = fopen(ncnn_modelbin, "wb"); + // magic + fprintf(pp, "7767517\n"); + onnx::GraphProto* mutable_graph = model.mutable_graph(); + int node_count = mutable_graph->node_size(); + + // node reference + std::map node_reference; + + // weight node and weight reshape node + std::map weights; + for (int j = 0; j < mutable_graph->initializer_size(); j++) + { + const onnx::TensorProto& initializer = mutable_graph->initializer(j); - if (!swapnode) { - for (int j = 0; j < (int)node->output_size(); j++) { - const std::string& output_name = node->output(j); - if (output_name.empty()) continue; + // fprintf(stderr, "weight = %s %d\n", initializer.name().c_str(), + // initializer.data_type()); - producers.insert(output_name); + weights[initializer.name()] = initializer; + } + // topological sort + { + // name -> producer node index + std::set producers; + for (int j = 0; j < mutable_graph->input_size(); j++) + { + const std::string& input_name = mutable_graph->input(j).name(); + producers.insert(input_name); } - i++; - continue; - } - - // find node that produce missing_input_name - int q = i + 1; - for (; q < node_count; q++) { - onnx::NodeProto* nodeq = mutable_graph->mutable_node(q); - bool found = false; - for (int j = 0; j < (int)nodeq->output_size(); j++) { - const std::string& output_name = nodeq->output(j); - if (output_name == missing_input_name) { - found = true; - break; - } - } + for (int i = 0; i < node_count;) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + bool swapnode = false; + std::string missing_input_name; + for (int j = 0; j < (int)node->input_size(); j++) + { + const std::string& input_name = node->input(j); + if (input_name.empty()) continue; + + if (producers.find(input_name) == producers.end() && + weights.find(input_name) == weights.end()) + { + swapnode = true; + missing_input_name = input_name; + break; + } + } - if (found) break; - } + if (!swapnode) + { + for (int j = 0; j < (int)node->output_size(); j++) + { + const std::string& output_name = node->output(j); + if (output_name.empty()) continue; - if (q == node_count) { - fprintf(stderr, "cannot find node produces %s but node %d requires it\n", - missing_input_name.c_str(), i); - return -1; - } - - // fprintf(stderr, "swap %d %d\n", i, q); - // swap this node with q - onnx::NodeProto* nodeq = mutable_graph->mutable_node(q); - onnx::NodeProto tmp = *node; - *node = *nodeq; - *nodeq = tmp; - } - } - // global definition line - // [layer count] [blob count] - std::set blob_names; - for (int i = 0; i < node_count; i++) { - const onnx::NodeProto& node = mutable_graph->node(i); - - const std::string& op = node.op_type(); - - std::string name = node.name(); - if (name.empty()) { - name = node.output(0); - } + producers.insert(output_name); + } - if (op == "Constant") { - onnx::TensorProto tensor = get_node_attr_tensor(node, "value"); - weights[node.output(0)] = tensor; - } + i++; + continue; + } - for 
(int j = 0; j < (int)node.input_size(); j++) { - const std::string& input_name = node.input(j); + // find node that produce missing_input_name + int q = i + 1; + for (; q < node_count; q++) + { + onnx::NodeProto* nodeq = mutable_graph->mutable_node(q); + bool found = false; + for (int j = 0; j < (int)nodeq->output_size(); j++) + { + const std::string& output_name = nodeq->output(j); + if (output_name == missing_input_name) + { + found = true; + break; + } + } + + if (found) break; + } - blob_names.insert(input_name); + if (q == node_count) + { + fprintf(stderr, "cannot find node produces %s but node %d requires it\n", missing_input_name.c_str(), i); + return -1; + } - if (node_reference.find(input_name) == node_reference.end()) { - node_reference[input_name] = 1; - } else { - node_reference[input_name] = node_reference[input_name] + 1; - } + // fprintf(stderr, "swap %d %d\n", i, q); + // swap this node with q + onnx::NodeProto* nodeq = mutable_graph->mutable_node(q); + onnx::NodeProto tmp = *node; + *node = *nodeq; + *nodeq = tmp; + } } + // global definition line + // [layer count] [blob count] + std::set blob_names; + for (int i = 0; i < node_count; i++) + { + const onnx::NodeProto& node = mutable_graph->node(i); - if (op == "Dropout") { - const std::string& output_name = node.output(0); - blob_names.insert(output_name); - node_reference[output_name] = 0; - continue; - } + const std::string& op = node.op_type(); - for (int j = 0; j < (int)node.output_size(); j++) { - const std::string& output_name = node.output(j); + std::string name = node.name(); + if (name.empty()) + { + name = node.output(0); + } - blob_names.insert(output_name); + if (op == "Constant") + { + onnx::TensorProto tensor = get_node_attr_tensor(node, "value"); + weights[node.output(0)] = tensor; + } - node_reference[output_name] = 0; - } - } - // include Input node - int input_node_count = 0; - for (int j = 0; j < mutable_graph->input_size(); j++) { - const std::string& input_name = mutable_graph->input(j).name(); - - // check weight - if (weights.find(input_name) != weights.end()) continue; - - blob_names.insert(input_name); - - input_node_count++; - } - - // for (auto a: node_reference) - // { - // fprintf(stderr, "a = %s %d\n", a.first.c_str(), a.second); - // } - - // op chain fusion - int reduced_node_count = 0; - { - fuse_identity(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_conv_reshape(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_weight_reshape(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_weight_transpose(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_shufflechannel(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_shufflechannel_split(mutable_graph, weights, node_reference, blob_names, - reduced_node_count); - fuse_hardsigmoid(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_hardswish(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_swish(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_batchnorm1d_squeeze_unsqueeze(mutable_graph, weights, node_reference, blob_names, - reduced_node_count); - fuse_unsqueeze_prelu(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_normalize(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_groupnorm(mutable_graph, weights, node_reference, blob_names, 
reduced_node_count); - fuse_layernorm(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_flatten(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_pixelshuffle(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_reorg(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_expand_broadcast(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_lstm_gru_rnn(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_multiheadattention(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_binaryop_with_scalar(mutable_graph, weights, node_reference, blob_names, - reduced_node_count); - fuse_rewrite_gather(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - } - // reduce common const weight node_reference - for (int i = 0; i < node_count; i++) { - const onnx::NodeProto& node = mutable_graph->node(i); - - const std::string& op = node.op_type(); - - if (op == "BatchNormalization") { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - node_reference[node.input(3)] -= 1; - node_reference[node.input(4)] -= 1; - } else if (op == "BiasGelu") { - node_reference[node.input(1)] -= 1; - } else if (op == "Clip") { - if (node.input_size() == 3) { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - } - } else if (op == "Conv") { - node_reference[node.input(1)] -= 1; - if (node.input_size() == 3) { - node_reference[node.input(2)] -= 1; - } - } else if (op == "ConvTranspose") { - node_reference[node.input(1)] -= 1; - if (node.input_size() == 3) { - node_reference[node.input(2)] -= 1; - } - } else if (op == "EmbedLayerNormalization") { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - node_reference[node.input(3)] -= 1; - node_reference[node.input(4)] -= 1; - node_reference[node.input(5)] -= 1; - node_reference[node.input(6)] -= 1; - } else if (op == "Gemm") { - float alpha = get_node_attr_f(node, "alpha", 1.f); - float beta = get_node_attr_f(node, "beta", 1.f); - int transA = get_node_attr_i(node, "transA", 0); - int transB = get_node_attr_i(node, "transB", 0); - - if (alpha == 1.f && beta == 1.f && transA == 0 && transB == 1) { - // InnerProduct-like A * B + C, C is optional. 
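// Illustrative aside: ONNX Gemm computes Y = alpha * A * op(B) + beta * C, so
// the special case above (alpha == beta == 1, transA == 0, transB == 1) is
// exactly Y = A * B^T + C, i.e. a fully connected layer that ncnn can express
// as InnerProduct with B folded into the layer weights. A scalar sketch of
// that computation (function name is illustrative):
#include <vector>

static std::vector<float> inner_product(const std::vector<float>& x,                // [in]
                                        const std::vector<std::vector<float> >& W,  // [out][in]
                                        const std::vector<float>& bias)             // [out], may be empty
{
    std::vector<float> y(W.size());
    for (size_t o = 0; o < W.size(); o++)
    {
        float acc = bias.empty() ? 0.f : bias[o];
        for (size_t i = 0; i < x.size(); i++) acc += x[i] * W[o][i];
        y[o] = acc;
    }
    return y;
}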
- node_reference[node.input(1)] -= 1; - if (node.input_size() == 3) { - node_reference[node.input(2)] -= 1; - } - } - } else if (op == "GroupNorm") { - int affine = get_node_attr_i(node, "affine", 1); - if (affine) { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - } - } else if (op == "GRU") { - for (int j = 1; j < node.input_size(); j++) { - node_reference[node.input(j)] -= 1; - } - } else if (op == "InstanceNormalization") { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - } else if (op == "LayerNorm") { - int affine = get_node_attr_i(node, "affine", 1); - if (affine) { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - } - } else if (op == "LSTM") { - for (int j = 1; j < node.input_size(); j++) { - node_reference[node.input(j)] -= 1; - } - } else if (op == "MatMul") { - if (weights.find(node.input(1)) != weights.end() && weights[node.input(1)].dims_size() == 2) { - // InnerProduct - node_reference[node.input(1)] -= 1; - } - } else if (op == "MultiHeadAttention") { - if (node.input_size() == 5) { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - node_reference[node.input(3)] -= 1; - node_reference[node.input(4)] -= 1; - } else { - node_reference[node.input(3)] -= 1; - node_reference[node.input(4)] -= 1; - node_reference[node.input(5)] -= 1; - node_reference[node.input(6)] -= 1; - node_reference[node.input(7)] -= 1; - node_reference[node.input(8)] -= 1; - node_reference[node.input(9)] -= 1; - node_reference[node.input(10)] -= 1; - } - } else if (op == "NonMaxSuppression") { - if (node.input_size() >= 3) { - node_reference[node.input(2)] -= 1; - } - if (node.input_size() >= 4) { - node_reference[node.input(3)] -= 1; - } - if (node.input_size() >= 5) { - node_reference[node.input(4)] -= 1; - } - } else if (op == "Pad") { - if (node.input_size() >= 2) { - node_reference[node.input(1)] -= 1; - } - } else if (op == "PRelu") { - node_reference[node.input(1)] -= 1; - } else if (op == "Reshape") { - if (node.input_size() == 2) { - if (weights[node.input(1)].data_type() != 0) { - node_reference[node.input(1)] -= 1; - } - } - } else if (op == "Resize") { - if (node.input_size() == 2) { - // opset 10 - node_reference[node.input(1)] -= 1; - } else { - // opset 11+ - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - if (node.input_size() >= 4) { - node_reference[node.input(3)] -= 1; - } - } - } else if (op == "RNN") { - for (int j = 1; j < node.input_size(); j++) { - node_reference[node.input(j)] -= 1; - } - } else if (op == "SkipLayerNormalization") { - node_reference[node.input(2)] -= 1; - node_reference[node.input(3)] -= 1; - node_reference[node.input(4)] -= 1; - } else if (op == "Slice") { - if (node.input_size() >= 2) { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - if (node.input_size() >= 4) node_reference[node.input(3)] -= 1; - if (node.input_size() >= 5) node_reference[node.input(4)] -= 1; - } - } else if (op == "Upsample") { - if (node.input_size() >= 2) { - node_reference[node.input(1)] -= 1; - } - } else if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d" || - op == "adaptive_max_pool2d") { - if (node.input_size() >= 2) { - node_reference[node.input(1)] -= 1; - } - } - } + for (int j = 0; j < (int)node.input_size(); j++) + { + const std::string& input_name = node.input(j); - // for (auto a: node_reference) - // { - // fprintf(stderr, "b = %s %d\n", a.first.c_str(), a.second); - // } + 
blob_names.insert(input_name); - // count all weight node with zero reference - int zero_reference_weight_node_count = 0; - for (std::map::iterator it = weights.begin(); it != weights.end(); - it++) { - const std::string& input_name = it->first; + if (node_reference.find(input_name) == node_reference.end()) + { + node_reference[input_name] = 1; + } + else + { + node_reference[input_name] = node_reference[input_name] + 1; + } + } - int refcount = node_reference[input_name]; - if (refcount == 0) zero_reference_weight_node_count++; - } + if (op == "Dropout") + { + const std::string& output_name = node.output(0); + blob_names.insert(output_name); + node_reference[output_name] = 0; + continue; + } - // we always treat constant node as weight or binaryop_weights - // do not count it twice for layer_count - int constant_node_count_moved_to_weight = 0; - for (int i = 0; i < node_count; i++) { - const onnx::NodeProto& node = mutable_graph->node(i); + for (int j = 0; j < (int)node.output_size(); j++) + { + const std::string& output_name = node.output(j); - const std::string& op = node.op_type(); + blob_names.insert(output_name); - if (op == "Constant") { - constant_node_count_moved_to_weight++; - } - } - - // some op may have anonymous input - // LSTM sequence_lens - blob_names.erase(""); - node_reference.erase(""); - - // remove node_reference entry with reference equals to one - int split_layer_count = 0; - int splitncnn_blob_count = 0; - // split node reference - std::map split_node_reference; - for (std::map::iterator it = node_reference.begin(); it != node_reference.end(); - it++) { - if (it->second > 1) { - split_layer_count++; - splitncnn_blob_count += it->second; - - split_node_reference[it->first] = it->second; + node_reference[output_name] = 0; + } } - } - - fprintf(pp, "%zu %zu\n", - node_count - constant_node_count_moved_to_weight + weights.size() - - zero_reference_weight_node_count - reduced_node_count + input_node_count + - split_layer_count, - blob_names.size() - zero_reference_weight_node_count + splitncnn_blob_count); - - int internal_split = 0; - - // place Input at the beginning - for (int j = 0; j < mutable_graph->input_size(); j++) { - const std::string& input_name = mutable_graph->input(j).name(); + // include Input node + int input_node_count = 0; + for (int j = 0; j < mutable_graph->input_size(); j++) + { + const std::string& input_name = mutable_graph->input(j).name(); - // check weight - if (weights.find(input_name) != weights.end()) continue; + // check weight + if (weights.find(input_name) != weights.end()) continue; - fprintf(pp, "%-16s %-24s 0 1 %s\n", "Input", input_name.c_str(), input_name.c_str()); + blob_names.insert(input_name); - int refcount = node_reference[input_name]; - if (refcount <= 1) { - continue; + input_node_count++; } - char splitname[256]; - sprintf(splitname, "splitncnn_input%d", j); - fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); - fprintf(pp, " %s", input_name.c_str()); + // for (auto a: node_reference) + // { + // fprintf(stderr, "a = %s %d\n", a.first.c_str(), a.second); + // } - for (int k = 0; k < refcount; k++) { - fprintf(pp, " %s_splitncnn_%d", input_name.c_str(), k); + // op chain fusion + int reduced_node_count = 0; + { + fuse_identity(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_conv_reshape(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_weight_reshape(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + 
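// Illustrative aside: all fuse_* passes share one signature, which is what
// lets this block chain them over the same graph and accumulate
// reduced_node_count; each pattern pass also advances its loop index past the
// nodes it consumed so a fused window is never matched twice. A sketch of the
// common shape (the typedef name is hypothetical):
typedef void (*FusePass)(onnx::GraphProto* mutable_graph,
                         std::map<std::string, onnx::TensorProto>& weights,
                         std::map<std::string, int>& node_reference,
                         std::set<std::string>& blob_names,
                         int& reduced_node_count);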
fuse_weight_transpose(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_shufflechannel(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_shufflechannel_split(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_hardsigmoid(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_hardswish(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_swish(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_batchnorm1d_squeeze_unsqueeze(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_unsqueeze_prelu(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_normalize(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_groupnorm(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_layernorm(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_flatten(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_pixelshuffle(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_reorg(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_expand_broadcast(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_lstm_gru_rnn(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_multiheadattention(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_binaryop_with_scalar(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_rewrite_gather(mutable_graph, weights, node_reference, blob_names, reduced_node_count); } - fprintf(pp, "\n"); - } + // reduce common const weight node_reference + for (int i = 0; i < node_count; i++) + { + const onnx::NodeProto& node = mutable_graph->node(i); - // place MemoryData next - for (std::map::iterator weight_it = weights.begin(); - weight_it != weights.end(); weight_it++) { - const std::string& input_name = weight_it->first; + const std::string& op = node.op_type(); - int refcount = node_reference[input_name]; - if (refcount == 0) { - continue; + if (op == "BatchNormalization") + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + node_reference[node.input(3)] -= 1; + node_reference[node.input(4)] -= 1; + } + else if (op == "BiasGelu") + { + node_reference[node.input(1)] -= 1; + } + else if (op == "Clip") + { + if (node.input_size() == 3) + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + } + } + else if (op == "Conv") + { + node_reference[node.input(1)] -= 1; + if (node.input_size() == 3) + { + node_reference[node.input(2)] -= 1; + } + } + else if (op == "ConvTranspose") + { + node_reference[node.input(1)] -= 1; + if (node.input_size() == 3) + { + node_reference[node.input(2)] -= 1; + } + } + else if (op == "EmbedLayerNormalization") + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + node_reference[node.input(3)] -= 1; + node_reference[node.input(4)] -= 1; + node_reference[node.input(5)] -= 1; + node_reference[node.input(6)] -= 1; + } + else if (op == "Gemm") + { + float alpha = get_node_attr_f(node, "alpha", 1.f); + float beta = get_node_attr_f(node, "beta", 1.f); + int transA = get_node_attr_i(node, "transA", 0); + int transB = get_node_attr_i(node, "transB", 0); + + if (alpha == 1.f && beta == 1.f && transA == 0 && 
transB == 1) + { + // InnerProduct-like A * B + C, C is optional. + node_reference[node.input(1)] -= 1; + if (node.input_size() == 3) + { + node_reference[node.input(2)] -= 1; + } + } + } + else if (op == "GroupNorm") + { + int affine = get_node_attr_i(node, "affine", 1); + if (affine) + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + } + } + else if (op == "GRU") + { + for (int j = 1; j < node.input_size(); j++) + { + node_reference[node.input(j)] -= 1; + } + } + else if (op == "InstanceNormalization") + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + } + else if (op == "LayerNorm") + { + int affine = get_node_attr_i(node, "affine", 1); + if (affine) + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + } + } + else if (op == "LSTM") + { + for (int j = 1; j < node.input_size(); j++) + { + node_reference[node.input(j)] -= 1; + } + } + else if (op == "MatMul") + { + if (weights.find(node.input(1)) != weights.end() && weights[node.input(1)].dims_size() == 2) + { + // InnerProduct + node_reference[node.input(1)] -= 1; + } + } + else if (op == "MultiHeadAttention") + { + if (node.input_size() == 5) + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + node_reference[node.input(3)] -= 1; + node_reference[node.input(4)] -= 1; + } + else + { + node_reference[node.input(3)] -= 1; + node_reference[node.input(4)] -= 1; + node_reference[node.input(5)] -= 1; + node_reference[node.input(6)] -= 1; + node_reference[node.input(7)] -= 1; + node_reference[node.input(8)] -= 1; + node_reference[node.input(9)] -= 1; + node_reference[node.input(10)] -= 1; + } + } + else if (op == "NonMaxSuppression") + { + if (node.input_size() >= 3) + { + node_reference[node.input(2)] -= 1; + } + if (node.input_size() >= 4) + { + node_reference[node.input(3)] -= 1; + } + if (node.input_size() >= 5) + { + node_reference[node.input(4)] -= 1; + } + } + else if (op == "Pad") + { + if (node.input_size() >= 2) + { + node_reference[node.input(1)] -= 1; + } + } + else if (op == "PRelu") + { + node_reference[node.input(1)] -= 1; + } + else if (op == "Reshape") + { + if (node.input_size() == 2) + { + if (weights[node.input(1)].data_type() != 0) + { + node_reference[node.input(1)] -= 1; + } + } + } + else if (op == "Resize") + { + if (node.input_size() == 2) + { + // opset 10 + node_reference[node.input(1)] -= 1; + } + else + { + // opset 11+ + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + if (node.input_size() >= 4) + { + node_reference[node.input(3)] -= 1; + } + } + } + else if (op == "RNN") + { + for (int j = 1; j < node.input_size(); j++) + { + node_reference[node.input(j)] -= 1; + } + } + else if (op == "SkipLayerNormalization") + { + node_reference[node.input(2)] -= 1; + node_reference[node.input(3)] -= 1; + node_reference[node.input(4)] -= 1; + } + else if (op == "Slice") + { + if (node.input_size() >= 2) + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + if (node.input_size() >= 4) node_reference[node.input(3)] -= 1; + if (node.input_size() >= 5) node_reference[node.input(4)] -= 1; + } + } + else if (op == "Upsample") + { + if (node.input_size() >= 2) + { + node_reference[node.input(1)] -= 1; + } + } + else if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d" || + op == "adaptive_max_pool2d") + { + if (node.input_size() >= 2) + { + node_reference[node.input(1)] -= 1; + } + } } - fprintf(pp, "%-16s %-24s 0 1 %s", "MemoryData", 
input_name.c_str(), input_name.c_str());
-
-    const onnx::TensorProto& M = weights[input_name];
-
-    if (M.dims_size() == 0) {
-      fprintf(pp, " 0=%d", get_tensor_proto_data_size(M));
-    } else if (M.dims_size() == 1) {
-      fprintf(pp, " 0=%d", (int)M.dims(0));
-    } else if (M.dims_size() == 2) {
-      fprintf(pp, " 0=%d", (int)M.dims(1));
-      if (M.dims(0) != 1) {
-        fprintf(pp, " 1=%d", (int)M.dims(0));
-      }
-    } else if (M.dims_size() == 3) {
-      fprintf(pp, " 0=%d", (int)M.dims(2));
-      fprintf(pp, " 1=%d", (int)M.dims(1));
-      if (M.dims(0) != 1) {
-        fprintf(pp, " 2=%d", (int)M.dims(0));
-      }
-    } else if (M.dims_size() == 4) {
-      fprintf(pp, " 0=%d", (int)M.dims(3));
-      fprintf(pp, " 1=%d", (int)M.dims(2));
-      fprintf(pp, " 2=%d", (int)M.dims(1));
-    }
+    // for (auto a: node_reference)
+    // {
+    //     fprintf(stderr, "b = %s %d\n", a.first.c_str(), a.second);
+    // }
-    fprintf(pp, "\n");
-    if (M.data_type() == 1) {
-      fwrite_tensor_proto_data(M, bp);
-    } else if (M.data_type() == 7 || M.data_type() == 6 || M.data_type() == 9 ||
-               M.data_type() == 11) {
-      fwrite_tensor_proto_data_to_float(M, bp);
-    } else {
-      fwrite_tensor_proto_data(M, bp);
-    }
+    // count all weight node with zero reference
+    int zero_reference_weight_node_count = 0;
+    for (std::map<std::string, onnx::TensorProto>::iterator it = weights.begin(); it != weights.end();
+         it++)
+    {
+        const std::string& input_name = it->first;
-    if (refcount <= 1) {
-      continue;
+        int refcount = node_reference[input_name];
+        if (refcount == 0) zero_reference_weight_node_count++;
     }
-    char splitname[256];
-    sprintf(splitname, "splitncnn_%d", internal_split);
-    fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount);
+    // we always treat constant node as weight or binaryop_weights
+    // do not count it twice for layer_count
+    int constant_node_count_moved_to_weight = 0;
+    for (int i = 0; i < node_count; i++)
+    {
+        const onnx::NodeProto& node = mutable_graph->node(i);
-    fprintf(pp, " %s", input_name.c_str());
+        const std::string& op = node.op_type();
-    for (int k = 0; k < refcount; k++) {
-      fprintf(pp, " %s_splitncnn_%d", input_name.c_str(), k);
+        if (op == "Constant")
+        {
+            constant_node_count_moved_to_weight++;
+        }
     }
-    fprintf(pp, "\n");
-    internal_split++;
-  }
+    // some op may have anonymous input
+    // LSTM sequence_lens
+    blob_names.erase("");
+    node_reference.erase("");
+
+    // remove node_reference entry with reference equals to one
+    int split_layer_count = 0;
+    int splitncnn_blob_count = 0;
+    // split node reference
+    std::map<std::string, int> split_node_reference;
+    for (std::map<std::string, int>::iterator it = node_reference.begin(); it != node_reference.end();
+         it++)
+    {
+        if (it->second > 1)
+        {
+            split_layer_count++;
+            splitncnn_blob_count += it->second;
-  for (int i = 0; i < node_count; i++) {
-    const onnx::NodeProto& node = mutable_graph->node(i);
-    const std::string& op = node.op_type();
+            split_node_reference[it->first] = it->second;
+        }
+    }
-    // fprintf(stderr, "op = %s\n", op.c_str());
+    fprintf(pp, "%zu %zu\n", node_count - constant_node_count_moved_to_weight + weights.size() - zero_reference_weight_node_count - reduced_node_count + input_node_count + split_layer_count, blob_names.size() - zero_reference_weight_node_count + splitncnn_blob_count);
-    if (op == "noop_reducedncnn") {
-      continue;
-    }
+    int internal_split = 0;
-    std::string name = node.name();
-    if (name.empty()) {
-      name = node.output(0);
-    }
+    // place Input at the beginning
+    for (int j = 0; j < mutable_graph->input_size(); j++)
+    {
+        const std::string& input_name = mutable_graph->input(j).name();
-    int input_size = node.input_size();
-    int
output_size = node.output_size();
+        // check weight
+        if (weights.find(input_name) != weights.end()) continue;
-    for (int j = 0; j < (int)node.input_size(); j++) {
-      const std::string& input_name = node.input(j);
+        fprintf(pp, "%-16s %-24s 0 1 %s\n", "Input", input_name.c_str(), input_name.c_str());
-      // check weight
-      if (weights.find(input_name) != weights.end() && node_reference[input_name] == 0) {
-        input_size--;
-      }
+        int refcount = node_reference[input_name];
+        if (refcount <= 1)
+        {
+            continue;
+        }
-      if (input_name.empty()) {
-        input_size--;
-      }
+        char splitname[256];
+        sprintf(splitname, "splitncnn_input%d", j);
+        fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount);
+        fprintf(pp, " %s", input_name.c_str());
-      // fprintf(stderr, "  input = %s\n", input_name.c_str());
+        for (int k = 0; k < refcount; k++)
+        {
+            fprintf(pp, " %s_splitncnn_%d", input_name.c_str(), k);
+        }
+        fprintf(pp, "\n");
     }
-    /*
-    for (int j=0; j<(int)node.output_size(); j++)
+
+    // place MemoryData next
+    for (std::map<std::string, onnx::TensorProto>::iterator weight_it = weights.begin();
+         weight_it != weights.end();
+         weight_it++)
     {
-        const std::string& output_name = node.output(j);
-        fprintf(stderr, "  output = %s\n", output_name.c_str());
-    }
-    */
-
-    if (op == "Abs") {
-      fprintf(pp, "%-16s", "UnaryOp");
-    } else if (op == "Acos") {
-      fprintf(pp, "%-16s", "UnaryOp");
-    } else if (op == "Add") {
-      fprintf(pp, "%-16s", "BinaryOp");
-    } else if (op == "ArgMax") {
-      fprintf(pp, "%-16s", "TopK");
-    } else if (op == "Asin") {
-      fprintf(pp, "%-16s", "UnaryOp");
-    } else if (op == "Atan") {
-      fprintf(pp, "%-16s", "UnaryOp");
-    } else if (op == "AveragePool" || op == "MaxPool") {
-      std::vector<int> kernel_shape = get_node_attr_ai(node, "kernel_shape");
-      if (kernel_shape.size() == 1) {
-        fprintf(pp, "%-16s", "Pooling1D");
-      } else {
-        fprintf(pp, "%-16s", "Pooling");
-      }
-    } else if (op == "BatchNormalization") {
-      fprintf(pp, "%-16s", "BatchNorm");
-    } else if (op == "BiasGelu") {
-      fprintf(pp, "%-16s", "BiasGelu");
-    } else if (op == "Cast") {
-      fprintf(pp, "%-16s", "Noop");
-    } else if (op == "Ceil") {
-      fprintf(pp, "%-16s", "UnaryOp");
-    } else if (op == "Clip") {
-      fprintf(pp, "%-16s", "Clip");
-    } else if (op == "Concat") {
-      fprintf(pp, "%-16s", "Concat");
-    } else if (op == "Constant") {
-      continue;
-    } else if (op == "ConstantOfShape") {
-      fprintf(pp, "%-16s", "ConstantOfShape");
-    } else if (op == "Conv") {
-      std::vector<int> kernel_shape = get_node_attr_ai(node, "kernel_shape");
-      if (kernel_shape.size() == 1) {
-        fprintf(pp, "%-16s", "Convolution1D");
-      } else {
-        int group = get_node_attr_i(node, "group", 1);
-        if (group > 1) {
-          fprintf(pp, "%-16s", "ConvolutionDepthWise");
-        } else {
-          fprintf(pp, "%-16s", "Convolution");
-        }
-      }
-    } else if (op == "ConvTranspose") {
-      int group = get_node_attr_i(node, "group", 1);
-      if (group > 1) {
-        fprintf(pp, "%-16s", "DeconvolutionDepthWise");
-      } else {
-        fprintf(pp, "%-16s", "Deconvolution");
-      }
-    } else if (op == "Cos") {
-      fprintf(pp, "%-16s", "UnaryOp");
-    } else if (op == "Crop") {
-      fprintf(pp, "%-16s", "Crop");
-    } else if (op == "DepthToSpace") {
-      fprintf(pp, "%-16s", "PixelShuffle");
-    } else if (op == "DetectionOutput") {
-      fprintf(pp, "%-16s", "DetectionOutput");
-    } else if (op == "Div") {
-      fprintf(pp, "%-16s", "BinaryOp");
-    } else if (op == "Dropout") {
-      fprintf(pp, "%-16s", "Dropout");
-      output_size = 1;
-    } else if (op == "Elu") {
-      fprintf(pp, "%-16s", "ELU");
-    } else if (op == "EmbedLayerNormalization") {
-      fprintf(pp, "%-16s", "EmbedLayerNormalization");
-    } else if (op
== "Equal") { - fprintf(pp, "%-16s", "Compare"); - } else if (op == "Exp") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Expand") { - fprintf(pp, "%-16s", "Expand"); - } else if (op == "Flatten") { - fprintf(pp, "%-16s", "Flatten"); - } else if (op == "Floor") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Gather") { - fprintf(pp, "%-16s", "Gather"); - } else if (op == "Gelu") { - fprintf(pp, "%-16s", "GELU"); - } else if (op == "Gemm") { - float alpha = get_node_attr_f(node, "alpha", 1.f); - float beta = get_node_attr_f(node, "beta", 1.f); - int transA = get_node_attr_i(node, "transA", 0); - int transB = get_node_attr_i(node, "transB", 0); - - if (alpha == 1.f && beta == 1.f && transA == 0 && transB == 1) { - // InnerProduct-like A * B + C - fprintf(pp, "%-16s", "InnerProduct"); - } else { - fprintf(pp, "%-16s", "Gemm"); - } - } else if (op == "GlobalAveragePool") { - fprintf(pp, "%-16s", "Pooling"); - } else if (op == "GlobalMaxPool") { - fprintf(pp, "%-16s", "Pooling"); - } else if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d" || - op == "adaptive_max_pool2d") { - fprintf(pp, "%-16s", "Pooling"); - } else if (op == "GroupNorm") { - fprintf(pp, "%-16s", "GroupNorm"); - } else if (op == "GRU") { - fprintf(pp, "%-16s", "GRU"); - } else if (op == "HardSigmoid") { - fprintf(pp, "%-16s", "HardSigmoid"); - } else if (op == "HardSwish") { - fprintf(pp, "%-16s", "HardSwish"); - } else if (op == "ImageScaler") { - fprintf(pp, "%-16s", "Scale"); - } else if (op == "InstanceNormalization") { - fprintf(pp, "%-16s", "InstanceNorm"); - } else if (op == "LayerNorm") { - fprintf(pp, "%-16s", "LayerNorm"); - } else if (op == "LeakyRelu") { - fprintf(pp, "%-16s", "ReLU"); - } else if (op == "Threshold") { - fprintf(pp, "%-16s", "Threshold"); - } else if (op == "Log") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "LRN") { - fprintf(pp, "%-16s", "LRN"); - } else if (op == "LSTM") { - fprintf(pp, "%-16s", "LSTM"); - } else if (op == "MatMul") { - if (weights.find(node.input(1)) != weights.end() && weights[node.input(1)].dims_size() == 2) { - fprintf(pp, "%-16s", "InnerProduct"); - } else { - fprintf(pp, "%-16s", "Gemm"); - } - } else if (op == "Max") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "Min") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "Mul") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "MultiHeadAttention") { - fprintf(pp, "%-16s", "MultiHeadAttention"); - } else if (op == "Neg") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "NonMaxSuppression") { - fprintf(pp, "%-16s", "NonMaxSuppression"); - } else if (op == "Normalize") { - fprintf(pp, "%-16s", "Normalize"); - } else if (op == "Pad") { - fprintf(pp, "%-16s", "Padding"); - } else if (op == "PixelShuffle") { - fprintf(pp, "%-16s", "PixelShuffle"); - } else if (op == "Pow") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "PriorBox") { - fprintf(pp, "%-16s", "PriorBox"); - } else if (op == "PRelu") { - fprintf(pp, "%-16s", "PReLU"); - } else if (op == "Range") { - fprintf(pp, "%-16s", "Range"); - } else if (op == "Reciprocal") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "ReduceMax" || op == "ReduceMin" || op == "ReduceMean" || op == "ReduceProd" || - op == "ReduceSum" || op == "ReduceSumSquare" || op == "ReduceL1" || - op == "ReduceL2" || op == "ReduceLogSum" || op == "ReduceLogSumExp") { - fprintf(pp, "%-16s", "Reduction"); - } else if (op == "Relu") { - fprintf(pp, "%-16s", "ReLU"); - } else if (op == "Reorg") { - fprintf(pp, "%-16s", 
"Reorg"); - } else if (op == "Reshape") { - fprintf(pp, "%-16s", "Reshape"); - } else if (op == "RNN") { - fprintf(pp, "%-16s", "RNN"); - } else if (op == "RDiv") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "RSub") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "RoiAlign") { - fprintf(pp, "%-16s", "ROIAlign"); - } else if (op == "ScatterND") { - fprintf(pp, "%-16s", "ScatterND"); - } else if (op == "Shape") { - fprintf(pp, "%-16s", "Shape"); - } else if (op == "ShuffleChannel") { - fprintf(pp, "%-16s", "ShuffleChannel"); - } else if (op == "Sigmoid") { - fprintf(pp, "%-16s", "Sigmoid"); - } else if (op == "Sin") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "SkipLayerNormalization") { - fprintf(pp, "%-16s", "SkipLayerNormalization"); - } else if (op == "Slice") { - std::vector ends; - std::vector steps; - bool use_crop = true; - - if (node.input_size() == 1) { - ends = get_node_attr_ai(node, "ends"); - steps = get_node_attr_ai(node, "steps"); // TODO - } else { - ends = get_node_attr_from_input_ai(weights[node.input(2)]); - if (node.input_size() >= 5) steps = get_node_attr_from_input_ai(weights[node.input(4)]); - } - - // assert step == 1 - for (int i = 0; i < (int)steps.size(); i++) { - if (steps[i] != 1 && steps[i] < ends[i]) { - use_crop = false; - break; - } - } - - if (use_crop) { - fprintf(pp, "%-16s", "Crop"); - } else { - fprintf(pp, "%-16s", "TensorSlice"); - } - } else if (op == "Softmax") { - fprintf(pp, "%-16s", "Softmax"); - } else if (op == "Softplus") { - fprintf(pp, "%-16s", "Softplus"); - } else if (op == "Split") { - fprintf(pp, "%-16s", "Slice"); - } else if (op == "Sqrt") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Squeeze") { - std::vector axes = get_node_attr_ai(node, "axes"); - // fprintf(stderr, "axes[0]: %d\n",axes[0]); - if (axes[0] == 0) { - fprintf(pp, "%-16s", "Noop"); - } else { - fprintf(pp, "%-16s", "Squeeze"); - } - } else if (op == "Sub") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "Sum") { - fprintf(pp, "%-16s", "Eltwise"); - } else if (op == "Swish") { - fprintf(pp, "%-16s", "Swish"); - } else if (op == "Tan") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Tanh") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Tile") { - fprintf(pp, "%-16s", "TileOnnx"); - } else if (op == "TopK") { - fprintf(pp, "%-16s", "TopK"); - } else if (op == "Transpose") { - fprintf(pp, "%-16s", "Permute"); - } else if (op == "Upsample" || op == "Resize") { - fprintf(pp, "%-16s", "Interp"); - } else if (op == "Unsqueeze") { - std::vector axes = get_node_attr_ai(node, "axes"); - // fprintf(stderr, "axes[0]: %d\n",axes[0]); - if (axes[0] == 0) { - fprintf(pp, "%-16s", "Noop"); - } else { - fprintf(pp, "%-16s", "ExpandDims"); - } - } else if (op == "Where") { - fprintf(pp, "%-16s", "Where"); - } else if (op == "Yolov3DetectionOutput") { - fprintf(pp, "%-16s", "Yolov3DetectionOutput"); - } else { - // TODO - fprintf(stderr, "%s not supported yet!\n", op.c_str()); - fprintf(pp, "%-16s", op.c_str()); - } + const std::string& input_name = weight_it->first; - fprintf(pp, " %-24s %d %d", name.c_str(), input_size, output_size); + int refcount = node_reference[input_name]; + if (refcount == 0) + { + continue; + } - for (int j = 0; j < (int)node.input_size(); j++) { - std::string input_name = node.input(j); + fprintf(pp, "%-16s %-24s 0 1 %s", "MemoryData", input_name.c_str(), input_name.c_str()); - // check weight - if (weights.find(input_name) != weights.end() && node_reference[input_name] == 0) { - continue; - } 
+ const onnx::TensorProto& M = weights[input_name]; - if (input_name.empty()) { - continue; - } + if (M.dims_size() == 0) + { + fprintf(pp, " 0=%d", get_tensor_proto_data_size(M)); + } + else if (M.dims_size() == 1) + { + fprintf(pp, " 0=%d", (int)M.dims(0)); + } + else if (M.dims_size() == 2) + { + fprintf(pp, " 0=%d", (int)M.dims(1)); + if (M.dims(0) != 1) + { + fprintf(pp, " 1=%d", (int)M.dims(0)); + } + } + else if (M.dims_size() == 3) + { + fprintf(pp, " 0=%d", (int)M.dims(2)); + fprintf(pp, " 1=%d", (int)M.dims(1)); + if (M.dims(0) != 1) + { + fprintf(pp, " 2=%d", (int)M.dims(0)); + } + } + else if (M.dims_size() == 4) + { + fprintf(pp, " 0=%d", (int)M.dims(3)); + fprintf(pp, " 1=%d", (int)M.dims(2)); + fprintf(pp, " 2=%d", (int)M.dims(1)); + } - if (split_node_reference.find(input_name) != split_node_reference.end()) { - int refidx = split_node_reference[input_name] - 1; - split_node_reference[input_name] = refidx; + fprintf(pp, "\n"); + if (M.data_type() == 1) + { + fwrite_tensor_proto_data(M, bp); + } + else if (M.data_type() == 7 || M.data_type() == 6 || M.data_type() == 9 || + M.data_type() == 11) + { + fwrite_tensor_proto_data_to_float(M, bp); + } + else + { + fwrite_tensor_proto_data(M, bp); + } - char splitsuffix[256]; - sprintf(splitsuffix, "_splitncnn_%d", refidx); - input_name = input_name + splitsuffix; - } + if (refcount <= 1) + { + continue; + } - fprintf(pp, " %s", input_name.c_str()); - } + char splitname[256]; + sprintf(splitname, "splitncnn_%d", internal_split); + fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); - for (int j = 0; j < output_size; j++) { - const std::string& output_name = node.output(j); + fprintf(pp, " %s", input_name.c_str()); - fprintf(pp, " %s", output_name.c_str()); - } + for (int k = 0; k < refcount; k++) + { + fprintf(pp, " %s_splitncnn_%d", input_name.c_str(), k); + } + fprintf(pp, "\n"); - if (op == "Abs") { - int op_type = 0; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Acos") { - int op_type = 13; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Add") { - int op_type = 0; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "ArgMax") { - int axis = get_node_attr_i(node, "axis"); - int keepdims = get_node_attr_i(node, "keepdims"); - fprintf(pp, " 0=%d", axis - 1); - fprintf(pp, " 3=%d", keepdims); - } else if (op == "Asin") { - int op_type = 12; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Atan") { - int op_type = 14; - fprintf(pp, " 0=%d", op_type); - } else if (op == "AveragePool" || op == "MaxPool") { - std::string auto_pad = get_node_attr_s(node, "auto_pad"); - int ceil_mode = get_node_attr_i(node, "ceil_mode", 0); - std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); - std::vector strides = get_node_attr_ai(node, "strides"); - std::vector pads = get_node_attr_ai(node, "pads"); - - int pool = op == "AveragePool" ? 
1 : 0; - int pad_mode = 1; - - if (auto_pad == "SAME_UPPER") { - pad_mode = 2; - } else if (auto_pad == "SAME_LOWER") { - pad_mode = 3; - } - - if (ceil_mode == 1) { - pad_mode = 0; - } - - fprintf(pp, " 0=%d", pool); - - if (kernel_shape.size() == 1) { - fprintf(pp, " 1=%d", kernel_shape[0]); - } else if (kernel_shape.size() == 2) { - fprintf(pp, " 1=%d", kernel_shape[1]); - fprintf(pp, " 11=%d", kernel_shape[0]); - } - - if (strides.size() == 1) { - fprintf(pp, " 2=%d", strides[0]); - } else if (strides.size() == 2) { - fprintf(pp, " 2=%d", strides[1]); - fprintf(pp, " 12=%d", strides[0]); - } - - if (pads.size() == 1) { - fprintf(pp, " 3=%d", pads[0]); - } else if (pads.size() == 2) { - fprintf(pp, " 3=%d", pads[1]); - fprintf(pp, " 13=%d", pads[0]); - } else if (pads.size() == 4) { - fprintf(pp, " 3=%d", pads[1]); - fprintf(pp, " 13=%d", pads[0]); - fprintf(pp, " 14=%d", pads[3]); - fprintf(pp, " 15=%d", pads[2]); - } - - fprintf(pp, " 5=%d", pad_mode); - - if (op == "AveragePool") { - int avgpool_count_include_pad = get_node_attr_i(node, "count_include_pad", 0); - fprintf(pp, " 6=%d", avgpool_count_include_pad); - } - } else if (op == "BatchNormalization") { - float epsilon = get_node_attr_f(node, "epsilon", 1e-5f); - - const onnx::TensorProto& scale = weights[node.input(1)]; - const onnx::TensorProto& B = weights[node.input(2)]; - const onnx::TensorProto& mean = weights[node.input(3)]; - const onnx::TensorProto& var = weights[node.input(4)]; - - int channels = get_tensor_proto_data_size(scale); - - fprintf(pp, " 0=%d", channels); - - fwrite_tensor_proto_data(scale, bp); - fwrite_tensor_proto_data(mean, bp); - // apply epsilon to var - { - const float* v = - var.has_raw_data() ? (const float*)var.raw_data().data() : var.float_data().data(); - - for (int j = 0; j < channels; j++) { - float ve = v[j] + epsilon; - fwrite(&ve, sizeof(float), 1, bp); - } - } - fwrite_tensor_proto_data(B, bp); - } else if (op == "BiasGelu") { - const onnx::TensorProto& B = weights[node.input(1)]; - - fprintf(pp, " 0=%d", get_tensor_proto_data_size(B)); - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(B, bp); - } else if (op == "Ceil") { - int op_type = 3; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Clip") { - float min; - float max; - if (node.input_size() == 1) { - min = get_node_attr_f(node, "min", -FLT_MAX); - max = get_node_attr_f(node, "max", FLT_MAX); - } else { - min = weights.find(node.input(1)) != weights.end() - ? get_node_attr_from_input(weights[node.input(1)]) - : -FLT_MAX; - max = weights.find(node.input(2)) != weights.end() - ? get_node_attr_from_input(weights[node.input(2)]) - : FLT_MAX; - } - - fprintf(pp, " 0=%e", min); - fprintf(pp, " 1=%e", max); - } else if (op == "Concat") { - int axis = get_node_attr_i(node, "axis", 1); - fprintf(pp, " 0=%d", axis - 1); - } else if (op == "Constant") { - // never reach here - } else if (op == "ConstantOfShape") { - float value = 0.f; - value = get_node_attr_f(node, "value", 0.f); - fprintf(pp, " 0=%f", value); - - } else if (op == "Conv") { - const onnx::TensorProto& W = weights[node.input(1)]; - - int num_filter = W.dims(0); - int has_bias = node.input_size() == 3 ? 
1 : 0; - - std::string auto_pad = get_node_attr_s(node, "auto_pad"); - std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); - std::vector dilations = get_node_attr_ai(node, "dilations"); - std::vector strides = get_node_attr_ai(node, "strides"); - std::vector pads = get_node_attr_ai(node, "pads"); - int group = get_node_attr_i(node, "group", 1); - - fprintf(pp, " 0=%d", num_filter); - - if (kernel_shape.size() == 1) { - fprintf(pp, " 1=%d", kernel_shape[0]); - } else if (kernel_shape.size() == 2) { - fprintf(pp, " 1=%d", kernel_shape[1]); - fprintf(pp, " 11=%d", kernel_shape[0]); - } - - if (dilations.size() == 1) { - fprintf(pp, " 2=%d", dilations[0]); - } else if (dilations.size() == 2) { - fprintf(pp, " 2=%d", dilations[1]); - fprintf(pp, " 12=%d", dilations[0]); - } - - if (strides.size() == 1) { - fprintf(pp, " 3=%d", strides[0]); - } else if (strides.size() == 2) { - fprintf(pp, " 3=%d", strides[1]); - fprintf(pp, " 13=%d", strides[0]); - } - - if (auto_pad == "SAME_UPPER") { - fprintf(pp, " 4=-233"); - } else if (auto_pad == "SAME_LOWER") { - fprintf(pp, " 4=-234"); - } else { - if (pads.size() == 1) { - fprintf(pp, " 4=%d", pads[0]); - } else if (pads.size() == 2) { - fprintf(pp, " 4=%d", pads[1]); - fprintf(pp, " 14=%d", pads[0]); - } else if (pads.size() == 4) { - fprintf(pp, " 4=%d", pads[1]); - fprintf(pp, " 14=%d", pads[0]); - fprintf(pp, " 15=%d", pads[3]); - fprintf(pp, " 16=%d", pads[2]); - } - } - - fprintf(pp, " 5=%d", has_bias); - - fprintf(pp, " 6=%d", get_tensor_proto_data_size(W)); - - if (group > 1) { - fprintf(pp, " 7=%d", group); - } - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(W, bp); - - if (has_bias) { - const onnx::TensorProto& B = weights[node.input(2)]; - fwrite_tensor_proto_data(B, bp); - } - } else if (op == "ConvTranspose") { - const onnx::TensorProto& W = weights[node.input(1)]; - - int has_bias = node.input_size() == 3 ? 
1 : 0; - - std::string auto_pad = get_node_attr_s(node, "auto_pad"); - std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); - std::vector dilations = get_node_attr_ai(node, "dilations"); - std::vector strides = get_node_attr_ai(node, "strides"); - std::vector output_padding = get_node_attr_ai(node, "output_padding"); - std::vector output_shape = get_node_attr_ai(node, "output_shape"); - std::vector pads = get_node_attr_ai(node, "pads"); - int group = get_node_attr_i(node, "group", 1); - int num_filter = W.dims(1) * group; - - fprintf(pp, " 0=%d", num_filter); - - if (kernel_shape.size() == 1) { - fprintf(pp, " 1=%d", kernel_shape[0]); - } else if (kernel_shape.size() == 2) { - fprintf(pp, " 1=%d", kernel_shape[1]); - fprintf(pp, " 11=%d", kernel_shape[0]); - } - - if (dilations.size() == 1) { - fprintf(pp, " 2=%d", dilations[0]); - } else if (dilations.size() == 2) { - fprintf(pp, " 2=%d", dilations[1]); - fprintf(pp, " 12=%d", dilations[0]); - } - - if (strides.size() == 1) { - fprintf(pp, " 3=%d", strides[0]); - } else if (strides.size() == 2) { - fprintf(pp, " 3=%d", strides[1]); - fprintf(pp, " 13=%d", strides[0]); - } - - if (auto_pad == "SAME_UPPER") { - fprintf(pp, " 4=-233"); - } else if (auto_pad == "SAME_LOWER") { - fprintf(pp, " 4=-234"); - } else { - if (pads.size() == 1) { - fprintf(pp, " 4=%d", pads[0]); - } else if (pads.size() == 2) { - fprintf(pp, " 4=%d", pads[1]); - fprintf(pp, " 14=%d", pads[0]); - } else if (pads.size() == 4) { - fprintf(pp, " 4=%d", pads[1]); - fprintf(pp, " 14=%d", pads[0]); - fprintf(pp, " 15=%d", pads[3]); - fprintf(pp, " 16=%d", pads[2]); - } - } - - if (output_padding.size() == 1) { - fprintf(pp, " 18=%d", output_padding[0]); - } else if (output_padding.size() == 2) { - fprintf(pp, " 18=%d", output_padding[1]); - fprintf(pp, " 19=%d", output_padding[0]); - } - - if (output_shape.size() == 1) { - fprintf(pp, " 20=%d", output_shape[0]); - } else if (output_shape.size() == 2) { - fprintf(pp, " 20=%d", output_shape[1]); - fprintf(pp, " 21=%d", output_shape[0]); - } - - fprintf(pp, " 5=%d", has_bias); - - fprintf(pp, " 6=%d", get_tensor_proto_data_size(W)); - - if (group > 1) { - fprintf(pp, " 7=%d", group); - } - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int maxk = 0; - if (kernel_shape.size() == 2) { - maxk = kernel_shape[1] * kernel_shape[0]; - } else { - maxk = kernel_shape[0] * kernel_shape[0]; - } - int weight_data_size = get_tensor_proto_data_size(W); - const float* weight_data = 0; - if (W.has_raw_data()) { - weight_data = (const float*)W.raw_data().data(); - } else if (W.data_type() == 1) { - weight_data = W.float_data().data(); - } - for (int g = 0; g < group; g++) { - // reorder weight from inch-outch to outch-inch - int num_filter_g = num_filter / group; - int num_input = weight_data_size / maxk / num_filter_g / group; - const float* weight_data_ptr = weight_data + g * maxk * num_filter_g * num_input; - for (int k = 0; k < num_filter_g; k++) { - for (int j = 0; j < num_input; j++) { - fwrite(weight_data_ptr + (j * num_filter_g + k) * maxk, sizeof(float), maxk, bp); - } - } - } - - if (has_bias) { - const onnx::TensorProto& B = weights[node.input(2)]; - fwrite_tensor_proto_data(B, bp); - } - } else if (op == "Cos") { - int op_type = 10; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Crop") { - auto starts = get_node_attr_ai(node, "starts"); - fprintf(pp, " -23309=%zu", starts.size()); - for (size_t j = 0; j < starts.size(); ++j) { - fprintf(pp, ",%i", starts[j]); - } - auto ends = 
get_node_attr_ai(node, "ends"); - fprintf(pp, " -23310=%zu", ends.size()); - for (size_t j = 0; j < ends.size(); ++j) { - fprintf(pp, ",%i", ends[j]); - } - auto axis = get_node_attr_ai(node, "axis"); - fprintf(pp, " -23311=%zu", axis.size()); - for (size_t j = 0; j < axis.size(); ++j) { - fprintf(pp, ",%i", axis[j]); - } - } else if (op == "DepthToSpace") { - // pixelshuffle - int scale_factor = get_node_attr_i(node, "blocksize", 1); - std::string mode = get_node_attr_s(node, "mode"); - fprintf(pp, " 0=%d", scale_factor); - if (mode == "CRD") { - fprintf(pp, " 1=0"); - } else if (mode == "DCR") { - fprintf(pp, " 1=1"); - } - } else if (op == "DetectionOutput") { - float score_threshold = get_node_attr_f(node, "score_threshold"); - float nms_threshold = get_node_attr_f(node, "nms_threshold"); - int nms_top_k = get_node_attr_i(node, "nms_top_k"); - int keep_top_k = get_node_attr_i(node, "keep_top_k"); - int num_class = get_node_attr_i(node, "num_class"); - std::vector vars = get_node_attr_af(node, "vars"); - fprintf(pp, " 0=%d", num_class); - fprintf(pp, " 1=%f", nms_threshold); - fprintf(pp, " 2=%d", nms_top_k); - fprintf(pp, " 3=%d", keep_top_k); - fprintf(pp, " 4=%f", score_threshold); - fprintf(pp, " 5=%f", vars[0]); - fprintf(pp, " 6=%f", vars[1]); - fprintf(pp, " 7=%f", vars[2]); - fprintf(pp, " 8=%f", vars[3]); - } else if (op == "Div") { - int op_type = 3; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "Dropout") { - // no-op - } else if (op == "Elu") { - float alpha = get_node_attr_f(node, "alpha", 1.f); - fprintf(pp, " 0=%e", alpha); - } else if (op == "EmbedLayerNormalization") { - const onnx::TensorProto& words = weights[node.input(2)]; - const onnx::TensorProto& positions = weights[node.input(3)]; - const onnx::TensorProto& W = weights[node.input(5)]; - const onnx::TensorProto& B = weights[node.input(6)]; - - fprintf(pp, " 0=%d", get_tensor_proto_data_size(B)); - fprintf(pp, " 1=%d", get_tensor_proto_data_size(words)); - fprintf(pp, " 2=%d", get_tensor_proto_data_size(positions)); - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(words, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(positions, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(W, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(B, bp); - } else if (op == "Equal") { - int op_type = 0; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Exp") { - int op_type = 7; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Flatten") { - int axis = get_node_attr_i(node, "axis", 1); - if (axis != 1) { - fprintf(stderr, "Unsupported Flatten axis %d!\n", axis); - } - } else if (op == "Floor") { - int op_type = 2; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Gather") { - if (weights[node.input(1)].dims_size() > 1) { - fprintf(stderr, "Unsupported indice dims > 1"); - } - int axis = get_node_attr_i(node, "axis", 1) - 1; - if (axis < 0) { - fprintf(stderr, "Unsupported Gather axis: %d\n", axis + 1); - } - fprintf(pp, " 0=%d", axis); - } else if (op == "Gelu") { - fprintf(pp, " 0=1"); - } else if (op == "Gemm") { - float alpha = get_node_attr_f(node, "alpha", 1.f); - float beta = get_node_attr_f(node, "beta", 1.f); - int transA = get_node_attr_i(node, "transA", 0); - int transB 
= get_node_attr_i(node, "transB", 0); - - if (alpha == 1.f && beta == 1.f && transA == 0 && transB == 1) { - // InnerProduct-like A * B + C - const onnx::TensorProto& B = weights[node.input(1)]; - // B has transposed. - int num_output = B.dims(0); - fprintf(pp, " 0=%d", num_output); - if (node.input_size() == 3) { - fprintf(pp, " 1=1"); - } else { - fprintf(pp, " 1=0"); - } - fprintf(pp, " 2=%d", get_tensor_proto_data_size(B)); - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - fwrite_tensor_proto_data(B, bp); - if (node.input_size() == 3) { - const onnx::TensorProto& C = weights[node.input(2)]; - fwrite_tensor_proto_data(C, bp); - } - } else { - // gemm - fprintf(pp, " 0=%e", alpha); - fprintf(pp, " 1=%e", beta); - fprintf(pp, " 2=%d", transA); - fprintf(pp, " 3=%d", transB); - } - } else if (op == "GlobalAveragePool") { - int pool = 1; - int global_pool = 1; - - fprintf(pp, " 0=%d", pool); - fprintf(pp, " 4=%d", global_pool); - } else if (op == "GlobalMaxPool") { - int pool = 0; - int global_pool = 1; - - fprintf(pp, " 0=%d", pool); - fprintf(pp, " 4=%d", global_pool); - } else if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d" || - op == "adaptive_max_pool2d") { - int pool = 0; - if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d") { - pool = 1; - } - int adaptive_pooling = 1; - const onnx::TensorProto& out_shape_tp = weights[node.input(1)]; - std::vector out_shape = get_node_attr_from_input_ai(out_shape_tp); - - fprintf(pp, " 0=%d", pool); - fprintf(pp, " 7=%d", adaptive_pooling); - if (out_shape.size() == 1) { - fprintf(pp, " 8=%d", out_shape[0]); - } else if (out_shape.size() == 2) { - // out_w - fprintf(pp, " 8=%d", out_shape[1]); - // out_h - fprintf(pp, " 18=%d", out_shape[0]); - } - } else if (op == "GroupNorm") { - int groups = get_node_attr_i(node, "groups", 1); - int channels = get_node_attr_i(node, "channels", 1); - float eps = get_node_attr_f(node, "epsilon", 1e-5f); - int affine = get_node_attr_i(node, "affine", 1); - - if (affine) { - // discard affine-less S=1 B=0 - std::vector affine_S = get_node_attr_from_input_af(weights[node.input(1)]); - std::vector affine_B = get_node_attr_from_input_af(weights[node.input(2)]); - if (affine_S.size() == 1 && affine_S[0] == 1.f && affine_B.size() == 1 && - affine_B[0] == 0.f) { - affine = 0; - } else { - affine = 0; - { - for (int j = 0; j < channels; j++) { - if (affine_S[j] != 1.f || affine_B[j] != 0.f) { - affine = 1; - break; - } - } - } - } - } - - fprintf(pp, " 0=%d", groups); - fprintf(pp, " 1=%d", channels); - fprintf(pp, " 2=%e", eps); - fprintf(pp, " 3=%d", affine); - if (affine) { - const onnx::TensorProto& scale = weights[node.input(1)]; - const onnx::TensorProto& B = weights[node.input(2)]; - - fwrite_tensor_proto_data(scale, bp); - fwrite_tensor_proto_data(B, bp); - } - } else if (op == "GRU") { - const onnx::TensorProto& W = weights[node.input(1)]; - const onnx::TensorProto& R = weights[node.input(2)]; - const onnx::TensorProto& B = weights[node.input(3)]; - - int hidden_size = get_node_attr_i(node, "hidden_size", 0); - std::string direction = get_node_attr_s(node, "direction"); - - int direction_type = 0; - if (direction == "forward") { - direction_type = 0; - } else if (direction == "reverse") { - direction_type = 1; - } else if (direction == "bidirectional") { - direction_type = 2; - } - - int weight_data_size = get_tensor_proto_data_size(W); - - fprintf(pp, " 0=%d", hidden_size); - fprintf(pp, " 1=%d", weight_data_size); - fprintf(pp, " 2=%d", direction_type); - - int 
num_directions = direction_type == 2 ? 2 : 1; - - int quantize_tag = 0; - - // reorder num_directions-URN-hidden-size to - // num_directions-RUN-hidden-size - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int weight_data_size_g = get_tensor_proto_data_size(W) / 3 / num_directions; - const float* wptr = - W.has_raw_data() ? (const float*)W.raw_data().data() : W.float_data().data(); - - const float* uptr = wptr; - const float* rptr = wptr + weight_data_size_g; - const float* nptr = wptr + weight_data_size_g * 2; - fwrite(rptr, sizeof(float), weight_data_size_g, bp); - fwrite(uptr, sizeof(float), weight_data_size_g, bp); - fwrite(nptr, sizeof(float), weight_data_size_g, bp); - - if (direction_type == 2) { - uptr += weight_data_size_g * 3; - rptr += weight_data_size_g * 3; - nptr += weight_data_size_g * 3; - fwrite(rptr, sizeof(float), weight_data_size_g, bp); - fwrite(uptr, sizeof(float), weight_data_size_g, bp); - fwrite(nptr, sizeof(float), weight_data_size_g, bp); - } - } - - // reduce U and R bias except N - // reorder num_directions-URN-hidden to num_directions-RUN-hidden - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int bias_data_size_g = get_tensor_proto_data_size(B) / 2 / 3 / num_directions; - const float* bptr = - B.has_raw_data() ? (const float*)B.raw_data().data() : B.float_data().data(); - const float* wuptr = bptr; - const float* wrptr = bptr + bias_data_size_g; - const float* wnptr = bptr + bias_data_size_g * 2; - const float* buptr = bptr + bias_data_size_g * 3; - const float* brptr = bptr + bias_data_size_g * 4; - const float* bnptr = bptr + bias_data_size_g * 5; - - for (int j = 0; j < bias_data_size_g; j++) { - float vb = wrptr[j] + brptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = wuptr[j] + buptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - fwrite(wnptr, sizeof(float), bias_data_size_g, bp); - fwrite(bnptr, sizeof(float), bias_data_size_g, bp); - - if (direction_type == 2) { - wuptr += bias_data_size_g * 6; - wrptr += bias_data_size_g * 6; - wnptr += bias_data_size_g * 6; - buptr += bias_data_size_g * 6; - brptr += bias_data_size_g * 6; - bnptr += bias_data_size_g * 6; - - for (int j = 0; j < bias_data_size_g; j++) { - float vb = wrptr[j] + brptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = wuptr[j] + buptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - fwrite(wnptr, sizeof(float), bias_data_size_g, bp); - fwrite(bnptr, sizeof(float), bias_data_size_g, bp); - } - } - - // reorder num_directions-URN-hidden-hidden to - // num_directions-RUN-hidden-hidden - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int weight_data_size_g = get_tensor_proto_data_size(R) / 3 / num_directions; - const float* Rptr = - R.has_raw_data() ? 
(const float*)R.raw_data().data() : R.float_data().data(); - - const float* uptr = Rptr; - const float* rptr = Rptr + weight_data_size_g; - const float* nptr = Rptr + weight_data_size_g * 2; - fwrite(rptr, sizeof(float), weight_data_size_g, bp); - fwrite(uptr, sizeof(float), weight_data_size_g, bp); - fwrite(nptr, sizeof(float), weight_data_size_g, bp); - - if (direction_type == 2) { - uptr += weight_data_size_g * 3; - rptr += weight_data_size_g * 3; - nptr += weight_data_size_g * 3; - fwrite(rptr, sizeof(float), weight_data_size_g, bp); - fwrite(uptr, sizeof(float), weight_data_size_g, bp); - fwrite(nptr, sizeof(float), weight_data_size_g, bp); - } - } - } else if (op == "HardSigmoid") { - float alpha = get_node_attr_f(node, "alpha", 0.2f); - float beta = get_node_attr_f(node, "beta", 0.5f); - - fprintf(pp, " 0=%e", alpha); - fprintf(pp, " 1=%e", beta); - } else if (op == "HardSwish") { - float alpha = get_node_attr_f(node, "alpha", 0.2f); - float beta = get_node_attr_f(node, "beta", 0.5f); - - fprintf(pp, " 0=%e", alpha); - fprintf(pp, " 1=%e", beta); - } else if (op == "ImageScaler") { - std::vector bias = get_node_attr_af(node, "bias"); - float scale = get_node_attr_f(node, "scale", 1.f); - - int channels = (int)bias.size(); - - fprintf(pp, " 0=%d", channels); - fprintf(pp, " 1=1"); - - for (int j = 0; j < channels; j++) { - fwrite(&scale, sizeof(float), 1, bp); - } - fwrite(&bias[0], sizeof(float), channels, bp); - } else if (op == "InstanceNormalization") { - float eps = get_node_attr_f(node, "epsilon", 1e-5f); - - // discard affine-less S=1 B=0 - std::vector affine_S = get_node_attr_from_input_af(weights[node.input(1)]); - std::vector affine_B = get_node_attr_from_input_af(weights[node.input(2)]); - int channels = (int)affine_S.size(); - int affine = 0; - { - for (int j = 0; j < channels; j++) { - if (affine_S[j] != 1.f || affine_B[j] != 0.f) { - affine = 1; - break; - } - } - } - - fprintf(pp, " 0=%d", channels); - fprintf(pp, " 1=%e", eps); - fprintf(pp, " 2=%d", affine); - if (affine) { - const onnx::TensorProto& scale = weights[node.input(1)]; - const onnx::TensorProto& B = weights[node.input(2)]; - - fwrite_tensor_proto_data(scale, bp); - fwrite_tensor_proto_data(B, bp); - } - } else if (op == "LayerNorm") { - float eps = get_node_attr_f(node, "epsilon", 1e-5f); - int affine = get_node_attr_i(node, "affine", 1); - - if (affine) { - // discard affine-less S=1 B=0 - std::vector affine_S = get_node_attr_from_input_af(weights[node.input(1)]); - std::vector affine_B = get_node_attr_from_input_af(weights[node.input(2)]); - int affine_size = (int)affine_S.size(); - affine = 0; - { - for (int j = 0; j < affine_size; j++) { - if (affine_S[j] != 1.f || affine_B[j] != 0.f) { - affine = 1; - break; - } - } - } - - if (affine) { - fprintf(pp, " 0=%d", affine_size); - } - } - - fprintf(pp, " 1=%e", eps); - fprintf(pp, " 2=%d", affine); - - if (affine) { - const onnx::TensorProto& scale = weights[node.input(1)]; - const onnx::TensorProto& B = weights[node.input(2)]; - - fwrite_tensor_proto_data(scale, bp); - fwrite_tensor_proto_data(B, bp); - } - } else if (op == "LeakyRelu") { - float alpha = get_node_attr_f(node, "alpha", 0.01f); - fprintf(pp, " 0=%e", alpha); - } else if (op == "Threshold") { - float threshold = get_node_attr_f(node, "threshold", 0.f); - fprintf(pp, " 0=%e", threshold); - } else if (op == "Log") { - int op_type = 8; - fprintf(pp, " 0=%d", op_type); - } else if (op == "LRN") { - float alpha = get_node_attr_f(node, "alpha", 1.f); - float beta = get_node_attr_f(node, "beta", 
0.5f); - float bias = get_node_attr_f(node, "bias", 1.f); - int size = get_node_attr_i(node, "size", 1); - - int norm_region = 0; - - fprintf(pp, " 0=%d", norm_region); - fprintf(pp, " 1=%d", size); - fprintf(pp, " 2=%e", alpha); - fprintf(pp, " 3=%e", beta); - fprintf(pp, " 4=%e", bias); - } else if (op == "LSTM") { - const onnx::TensorProto& W = weights[node.input(1)]; - const onnx::TensorProto& R = weights[node.input(2)]; - const onnx::TensorProto& B = weights[node.input(3)]; - - int hidden_size = get_node_attr_i(node, "hidden_size", 0); - std::string direction = get_node_attr_s(node, "direction"); - - int direction_type = 0; - if (direction == "forward") { - direction_type = 0; - } else if (direction == "reverse") { - direction_type = 1; - } else if (direction == "bidirectional") { - direction_type = 2; - } - - int weight_data_size = get_tensor_proto_data_size(W); - - fprintf(pp, " 0=%d", hidden_size); - fprintf(pp, " 1=%d", weight_data_size); - fprintf(pp, " 2=%d", direction_type); - - int num_directions = direction_type == 2 ? 2 : 1; - - int quantize_tag = 0; - - // reorder num_directions-IOFG-hidden-size to - // num_directions-IFOG-hidden-size - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int weight_data_size_g = get_tensor_proto_data_size(W) / 4 / num_directions; - const float* wptr = - W.has_raw_data() ? (const float*)W.raw_data().data() : W.float_data().data(); - - const float* iptr = wptr; - const float* optr = wptr + weight_data_size_g; - const float* fptr = wptr + weight_data_size_g * 2; - const float* gptr = wptr + weight_data_size_g * 3; - fwrite(iptr, sizeof(float), weight_data_size_g, bp); - fwrite(fptr, sizeof(float), weight_data_size_g, bp); - fwrite(optr, sizeof(float), weight_data_size_g, bp); - fwrite(gptr, sizeof(float), weight_data_size_g, bp); - - if (direction_type == 2) { - iptr += weight_data_size_g * 4; - optr += weight_data_size_g * 4; - fptr += weight_data_size_g * 4; - gptr += weight_data_size_g * 4; - fwrite(iptr, sizeof(float), weight_data_size_g, bp); - fwrite(fptr, sizeof(float), weight_data_size_g, bp); - fwrite(optr, sizeof(float), weight_data_size_g, bp); - fwrite(gptr, sizeof(float), weight_data_size_g, bp); - } - } - - // reduce xc and hc bias - // reorder num_directions-IOFG-hidden to num_directions-IFOG-hidden - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int bias_data_size_g = get_tensor_proto_data_size(B) / 2 / 4 / num_directions; - const float* xcbptr = - B.has_raw_data() ? 
(const float*)B.raw_data().data() : B.float_data().data(); - const float* xiptr = xcbptr; - const float* xoptr = xcbptr + bias_data_size_g; - const float* xfptr = xcbptr + bias_data_size_g * 2; - const float* xgptr = xcbptr + bias_data_size_g * 3; - const float* hiptr = xcbptr + bias_data_size_g * 4; - const float* hoptr = xcbptr + bias_data_size_g * 5; - const float* hfptr = xcbptr + bias_data_size_g * 6; - const float* hgptr = xcbptr + bias_data_size_g * 7; - - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xiptr[j] + hiptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xfptr[j] + hfptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xoptr[j] + hoptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xgptr[j] + hgptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - - if (direction_type == 2) { - xiptr += bias_data_size_g * 8; - xoptr += bias_data_size_g * 8; - xfptr += bias_data_size_g * 8; - xgptr += bias_data_size_g * 8; - hiptr += bias_data_size_g * 8; - hoptr += bias_data_size_g * 8; - hfptr += bias_data_size_g * 8; - hgptr += bias_data_size_g * 8; - - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xiptr[j] + hiptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xfptr[j] + hfptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xoptr[j] + hoptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xgptr[j] + hgptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - } - } - - // reorder num_directions-IOFG-hidden-hidden to - // num_directions-IFOG-hidden-hidden - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int weight_data_size_g = get_tensor_proto_data_size(R) / 4 / num_directions; - const float* rptr = - R.has_raw_data() ? (const float*)R.raw_data().data() : R.float_data().data(); - - const float* iptr = rptr; - const float* optr = rptr + weight_data_size_g; - const float* fptr = rptr + weight_data_size_g * 2; - const float* gptr = rptr + weight_data_size_g * 3; - fwrite(iptr, sizeof(float), weight_data_size_g, bp); - fwrite(fptr, sizeof(float), weight_data_size_g, bp); - fwrite(optr, sizeof(float), weight_data_size_g, bp); - fwrite(gptr, sizeof(float), weight_data_size_g, bp); - - if (direction_type == 2) { - iptr += weight_data_size_g * 4; - optr += weight_data_size_g * 4; - fptr += weight_data_size_g * 4; - gptr += weight_data_size_g * 4; - fwrite(iptr, sizeof(float), weight_data_size_g, bp); - fwrite(fptr, sizeof(float), weight_data_size_g, bp); - fwrite(optr, sizeof(float), weight_data_size_g, bp); - fwrite(gptr, sizeof(float), weight_data_size_g, bp); - } - } - } else if (op == "MatMul") { - if (weights.find(node.input(1)) != weights.end() && weights[node.input(1)].dims_size() == 2) { - // InnerProduct - const onnx::TensorProto& B = weights[node.input(1)]; - - int weight_data_size = get_tensor_proto_data_size(B); - - int num_output = B.dims(B.dims_size() - 1); - int num_input = weight_data_size / num_output; - - fprintf(pp, " 0=%d", num_output); - fprintf(pp, " 1=0"); - fprintf(pp, " 2=%d", weight_data_size); - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - - // reorder num_input-num_output to num_output-num_input - { - const float* bptr = - B.has_raw_data() ? 
(const float*)B.raw_data().data() : B.float_data().data(); - - for (int j = 0; j < num_output; j++) { - for (int k = 0; k < num_input; k++) { - float vb = bptr[k * num_output + j]; - fwrite(&vb, sizeof(float), 1, bp); - } - } - } - - // fwrite_tensor_proto_data(B, bp) - } else { - // default matrix multiplication - } - } else if (op == "Max") { - int op_type = 4; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "Min") { - int op_type = 5; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "Mul") { - int op_type = 2; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "MultiHeadAttention") { - int embed_dim = get_node_attr_i(node, "embed_dim", 0); - int num_heads = get_node_attr_i(node, "num_heads", 0); + internal_split++; + } - fprintf(pp, " 0=%d", embed_dim); - fprintf(pp, " 1=%d", num_heads); + for (int i = 0; i < node_count; i++) + { + const onnx::NodeProto& node = mutable_graph->node(i); + const std::string& op = node.op_type(); - if (node.input_size() == 5) { - const onnx::TensorProto& qkvw = weights[node.input(1)]; - const onnx::TensorProto& qkvb = weights[node.input(2)]; - const onnx::TensorProto& ow = weights[node.input(3)]; - const onnx::TensorProto& ob = weights[node.input(4)]; + // fprintf(stderr, "op = %s\n", op.c_str()); - int weight_data_size = get_tensor_proto_data_size(ow); + if (op == "noop_reducedncnn") + { + continue; + } - fprintf(pp, " 2=%d", weight_data_size); + std::string name = node.name(); + if (name.empty()) + { + name = node.output(0); + } - int quantize_tag = 0; + int input_size = node.input_size(); + int output_size = node.output_size(); - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose qw + for (int j = 0; j < (int)node.input_size(); j++) { - const float* wptr = - qkvw.has_raw_data() ? (const float*)qkvw.raw_data().data() : qkvw.float_data().data(); - const float* bptr = - qkvb.has_raw_data() ? (const float*)qkvb.raw_data().data() : qkvb.float_data().data(); + const std::string& input_name = node.input(j); - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim * 3 + k]; - fwrite(&vb, sizeof(float), 1, bp); + // check weight + if (weights.find(input_name) != weights.end() && node_reference[input_name] == 0) + { + input_size--; } - } - fwrite(bptr, sizeof(float), embed_dim, bp); - } + if (input_name.empty()) + { + input_size--; + } - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose kw + // fprintf(stderr, " input = %s\n", input_name.c_str()); + } + /* + for (int j=0; j<(int)node.output_size(); j++) { - const float* wptr = - qkvw.has_raw_data() ? (const float*)qkvw.raw_data().data() : qkvw.float_data().data(); - const float* bptr = - qkvb.has_raw_data() ? 
(const float*)qkvb.raw_data().data() : qkvb.float_data().data(); - bptr += embed_dim; + const std::string& output_name = node.output(j); + fprintf(stderr, " output = %s\n", output_name.c_str()); + } + */ - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim * 3 + k + embed_dim]; - fwrite(&vb, sizeof(float), 1, bp); + if (op == "Abs") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Acos") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Add") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "ArgMax") + { + fprintf(pp, "%-16s", "TopK"); + } + else if (op == "Asin") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Atan") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "AveragePool" || op == "MaxPool") + { + std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); + if (kernel_shape.size() == 1) + { + fprintf(pp, "%-16s", "Pooling1D"); + } + else + { + fprintf(pp, "%-16s", "Pooling"); } - } - - fwrite(bptr, sizeof(float), embed_dim, bp); } - - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose vw + else if (op == "BatchNormalization") { - const float* wptr = - qkvw.has_raw_data() ? (const float*)qkvw.raw_data().data() : qkvw.float_data().data(); - const float* bptr = - qkvb.has_raw_data() ? (const float*)qkvb.raw_data().data() : qkvb.float_data().data(); - bptr += embed_dim * 2; - - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim * 3 + k + embed_dim * 2]; - fwrite(&vb, sizeof(float), 1, bp); + fprintf(pp, "%-16s", "BatchNorm"); + } + else if (op == "BiasGelu") + { + fprintf(pp, "%-16s", "BiasGelu"); + } + else if (op == "Cast") + { + fprintf(pp, "%-16s", "Noop"); + } + else if (op == "Ceil") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Clip") + { + fprintf(pp, "%-16s", "Clip"); + } + else if (op == "Concat") + { + fprintf(pp, "%-16s", "Concat"); + } + else if (op == "Constant") + { + continue; + } + else if (op == "ConstantOfShape") + { + fprintf(pp, "%-16s", "ConstantOfShape"); + } + else if (op == "Conv") + { + std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); + if (kernel_shape.size() == 1) + { + fprintf(pp, "%-16s", "Convolution1D"); + } + else + { + int group = get_node_attr_i(node, "group", 1); + if (group > 1) + { + fprintf(pp, "%-16s", "ConvolutionDepthWise"); + } + else + { + fprintf(pp, "%-16s", "Convolution"); + } } - } - - fwrite(bptr, sizeof(float), embed_dim, bp); } - - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose ow + else if (op == "ConvTranspose") { - const float* wptr = - ow.has_raw_data() ? 
(const float*)ow.raw_data().data() : ow.float_data().data(); - - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim + k]; - fwrite(&vb, sizeof(float), 1, bp); + int group = get_node_attr_i(node, "group", 1); + if (group > 1) + { + fprintf(pp, "%-16s", "DeconvolutionDepthWise"); + } + else + { + fprintf(pp, "%-16s", "Deconvolution"); } - } } - fwrite_tensor_proto_data(ob, bp); - } else { - const onnx::TensorProto& qw = weights[node.input(3)]; - const onnx::TensorProto& qb = weights[node.input(4)]; - const onnx::TensorProto& kw = weights[node.input(5)]; - const onnx::TensorProto& kb = weights[node.input(6)]; - const onnx::TensorProto& vw = weights[node.input(7)]; - const onnx::TensorProto& vb = weights[node.input(8)]; - const onnx::TensorProto& ow = weights[node.input(9)]; - const onnx::TensorProto& ob = weights[node.input(10)]; - - int weight_data_size = get_tensor_proto_data_size(qw); - - fprintf(pp, " 2=%d", weight_data_size); - - int quantize_tag = 0; - - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose qw + else if (op == "Cos") { - const float* wptr = - qw.has_raw_data() ? (const float*)qw.raw_data().data() : qw.float_data().data(); - - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim + k]; - fwrite(&vb, sizeof(float), 1, bp); + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Crop") + { + fprintf(pp, "%-16s", "Crop"); + } + else if (op == "DepthToSpace") + { + fprintf(pp, "%-16s", "PixelShuffle"); + } + else if (op == "DetectionOutput") + { + fprintf(pp, "%-16s", "DetectionOutput"); + } + else if (op == "Div") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "Dropout") + { + fprintf(pp, "%-16s", "Dropout"); + output_size = 1; + } + else if (op == "Elu") + { + fprintf(pp, "%-16s", "ELU"); + } + else if (op == "EmbedLayerNormalization") + { + fprintf(pp, "%-16s", "EmbedLayerNormalization"); + } + else if (op == "Equal") + { + fprintf(pp, "%-16s", "Compare"); + } + else if (op == "Exp") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Expand") + { + fprintf(pp, "%-16s", "Expand"); + } + else if (op == "Flatten") + { + fprintf(pp, "%-16s", "Flatten"); + } + else if (op == "Floor") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Gather") + { + fprintf(pp, "%-16s", "Gather"); + } + else if (op == "Gelu") + { + fprintf(pp, "%-16s", "GELU"); + } + else if (op == "Gemm") + { + float alpha = get_node_attr_f(node, "alpha", 1.f); + float beta = get_node_attr_f(node, "beta", 1.f); + int transA = get_node_attr_i(node, "transA", 0); + int transB = get_node_attr_i(node, "transB", 0); + + if (alpha == 1.f && beta == 1.f && transA == 0 && transB == 1) + { + // InnerProduct-like A * B + C + fprintf(pp, "%-16s", "InnerProduct"); + } + else + { + fprintf(pp, "%-16s", "Gemm"); + } + } + else if (op == "GlobalAveragePool") + { + fprintf(pp, "%-16s", "Pooling"); + } + else if (op == "GlobalMaxPool") + { + fprintf(pp, "%-16s", "Pooling"); + } + else if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d" || + op == "adaptive_max_pool2d") + { + fprintf(pp, "%-16s", "Pooling"); + } + else if (op == "GroupNorm") + { + fprintf(pp, "%-16s", "GroupNorm"); + } + else if (op == "GRU") + { + fprintf(pp, "%-16s", "GRU"); + } + else if (op == "HardSigmoid") + { + fprintf(pp, "%-16s", "HardSigmoid"); + } + else if (op == "HardSwish") + { + fprintf(pp, "%-16s", "HardSwish"); + } + else if (op == "ImageScaler") + { + fprintf(pp, "%-16s", "Scale"); + 
} + else if (op == "InstanceNormalization") + { + fprintf(pp, "%-16s", "InstanceNorm"); + } + else if (op == "LayerNorm") + { + fprintf(pp, "%-16s", "LayerNorm"); + } + else if (op == "LeakyRelu") + { + fprintf(pp, "%-16s", "ReLU"); + } + else if (op == "Threshold") + { + fprintf(pp, "%-16s", "Threshold"); + } + else if (op == "Log") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "LRN") + { + fprintf(pp, "%-16s", "LRN"); + } + else if (op == "LSTM") + { + fprintf(pp, "%-16s", "LSTM"); + } + else if (op == "MatMul") + { + if (weights.find(node.input(1)) != weights.end() && weights[node.input(1)].dims_size() == 2) + { + fprintf(pp, "%-16s", "InnerProduct"); + } + else + { + fprintf(pp, "%-16s", "Gemm"); + } + } + else if (op == "Max") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "Min") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "Mul") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "MultiHeadAttention") + { + fprintf(pp, "%-16s", "MultiHeadAttention"); + } + else if (op == "Neg") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "NonMaxSuppression") + { + fprintf(pp, "%-16s", "NonMaxSuppression"); + } + else if (op == "Normalize") + { + fprintf(pp, "%-16s", "Normalize"); + } + else if (op == "Pad") + { + fprintf(pp, "%-16s", "Padding"); + } + else if (op == "PixelShuffle") + { + fprintf(pp, "%-16s", "PixelShuffle"); + } + else if (op == "Pow") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "PriorBox") + { + fprintf(pp, "%-16s", "PriorBox"); + } + else if (op == "PRelu") + { + fprintf(pp, "%-16s", "PReLU"); + } + else if (op == "Range") + { + fprintf(pp, "%-16s", "Range"); + } + else if (op == "Reciprocal") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "ReduceMax" || op == "ReduceMin" || op == "ReduceMean" || op == "ReduceProd" || + op == "ReduceSum" || op == "ReduceSumSquare" || op == "ReduceL1" || + op == "ReduceL2" || op == "ReduceLogSum" || op == "ReduceLogSumExp") + { + fprintf(pp, "%-16s", "Reduction"); + } + else if (op == "Relu") + { + fprintf(pp, "%-16s", "ReLU"); + } + else if (op == "Reorg") + { + fprintf(pp, "%-16s", "Reorg"); + } + else if (op == "Reshape") + { + fprintf(pp, "%-16s", "Reshape"); + } + else if (op == "RNN") + { + fprintf(pp, "%-16s", "RNN"); + } + else if (op == "RDiv") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "RSub") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "RoiAlign") + { + fprintf(pp, "%-16s", "ROIAlign"); + } + else if (op == "ScatterND") + { + fprintf(pp, "%-16s", "ScatterND"); + } + else if (op == "Shape") + { + fprintf(pp, "%-16s", "Shape"); + } + else if (op == "ShuffleChannel") + { + fprintf(pp, "%-16s", "ShuffleChannel"); + } + else if (op == "Sigmoid") + { + fprintf(pp, "%-16s", "Sigmoid"); + } + else if (op == "Sin") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "SkipLayerNormalization") + { + fprintf(pp, "%-16s", "SkipLayerNormalization"); + } + else if (op == "Slice") + { + std::vector<int> ends; + std::vector<int> steps; + bool use_crop = true; + + if (node.input_size() == 1) + { + ends = get_node_attr_ai(node, "ends"); + steps = get_node_attr_ai(node, "steps"); // TODO + } + else + { + ends = get_node_attr_from_input_ai(weights[node.input(2)]); + if (node.input_size() >= 5) steps = get_node_attr_from_input_ai(weights[node.input(4)]); + } + + // assert step == 1 + for (int i = 0; i < (int)steps.size(); i++) + { + if (steps[i] != 1 && steps[i] < ends[i]) + { + use_crop = false; + break; + } + } + + if (use_crop) + { + fprintf(pp, "%-16s", "Crop"); + } + else + { + fprintf(pp, "%-16s", "TensorSlice"); + } + } + else if (op == "Softmax") + { + fprintf(pp, "%-16s", "Softmax"); + } + else if (op == "Softplus") + { + fprintf(pp, "%-16s", "Softplus"); + } + else if (op == "Split") + { + fprintf(pp, "%-16s", "Slice"); + } + else if (op == "Sqrt") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Squeeze") + { + std::vector<int> axes = get_node_attr_ai(node, "axes"); + // fprintf(stderr, "axes[0]: %d\n",axes[0]); + if (axes[0] == 0) + { + fprintf(pp, "%-16s", "Noop"); + } + else + { + fprintf(pp, "%-16s", "Squeeze"); + } + } + else if (op == "Sub") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "Sum") + { + fprintf(pp, "%-16s", "Eltwise"); + } + else if (op == "Swish") + { + fprintf(pp, "%-16s", "Swish"); + } + else if (op == "Tan") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Tanh") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Tile") + { + fprintf(pp, "%-16s", "TileOnnx"); + } + else if (op == "TopK") + { + fprintf(pp, "%-16s", "TopK"); + } + else if (op == "Transpose") + { + fprintf(pp, "%-16s", "Permute"); + } + else if (op == "Upsample" || op == "Resize") + { + fprintf(pp, "%-16s", "Interp"); + } + else if (op == "Unsqueeze") + { + std::vector<int> axes = get_node_attr_ai(node, "axes"); + // fprintf(stderr, "axes[0]: %d\n",axes[0]); + if (axes[0] == 0) + { + fprintf(pp, "%-16s", "Noop"); + } + else + { + fprintf(pp, "%-16s", "ExpandDims"); + } + } + else if (op == "Where") + { + fprintf(pp, "%-16s", "Where"); + } + else if (op == "Yolov3DetectionOutput") + { + fprintf(pp, "%-16s", "Yolov3DetectionOutput"); + } + else + { + // TODO + fprintf(stderr, "%s not supported yet!\n", op.c_str()); + fprintf(pp, "%-16s", op.c_str()); + } + + fprintf(pp, " %-24s %d %d", name.c_str(), input_size, output_size); + + for (int j = 0; j < (int)node.input_size(); j++) + { + std::string input_name = node.input(j); + + // check weight + if (weights.find(input_name) != weights.end() && node_reference[input_name] == 0) + { + continue; + } + + if (input_name.empty()) + { + continue; + } + + if (split_node_reference.find(input_name) != split_node_reference.end()) + { + int refidx = split_node_reference[input_name] - 1; + split_node_reference[input_name] = refidx; + + char splitsuffix[256]; + sprintf(splitsuffix, "_splitncnn_%d", refidx); + input_name = input_name + splitsuffix; + } + + fprintf(pp, " %s", input_name.c_str()); + } + + for (int j = 0; j < output_size; j++) + { + const std::string& output_name = node.output(j); + + fprintf(pp, " %s", output_name.c_str()); + } + + if (op == "Abs") + { + int op_type = 0; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Acos") + { + int op_type = 13; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Add") + { + int op_type = 0; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "ArgMax") + { + int axis = get_node_attr_i(node, "axis"); + int keepdims = get_node_attr_i(node, "keepdims"); + fprintf(pp, " 0=%d", axis - 1); + fprintf(pp, " 3=%d", keepdims); + } + else if (op == "Asin") + { + int op_type = 12; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Atan") + { + int op_type = 14; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "AveragePool" || op == "MaxPool") + { + std::string auto_pad = 
get_node_attr_s(node, "auto_pad"); + int ceil_mode = get_node_attr_i(node, "ceil_mode", 0); + std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); + std::vector strides = get_node_attr_ai(node, "strides"); + std::vector pads = get_node_attr_ai(node, "pads"); + + int pool = op == "AveragePool" ? 1 : 0; + int pad_mode = 1; + + if (auto_pad == "SAME_UPPER") + { + pad_mode = 2; + } + else if (auto_pad == "SAME_LOWER") + { + pad_mode = 3; + } + + if (ceil_mode == 1) + { + pad_mode = 0; + } + + fprintf(pp, " 0=%d", pool); + + if (kernel_shape.size() == 1) + { + fprintf(pp, " 1=%d", kernel_shape[0]); + } + else if (kernel_shape.size() == 2) + { + fprintf(pp, " 1=%d", kernel_shape[1]); + fprintf(pp, " 11=%d", kernel_shape[0]); + } + + if (strides.size() == 1) + { + fprintf(pp, " 2=%d", strides[0]); + } + else if (strides.size() == 2) + { + fprintf(pp, " 2=%d", strides[1]); + fprintf(pp, " 12=%d", strides[0]); + } + + if (pads.size() == 1) + { + fprintf(pp, " 3=%d", pads[0]); + } + else if (pads.size() == 2) + { + fprintf(pp, " 3=%d", pads[1]); + fprintf(pp, " 13=%d", pads[0]); + } + else if (pads.size() == 4) + { + fprintf(pp, " 3=%d", pads[1]); + fprintf(pp, " 13=%d", pads[0]); + fprintf(pp, " 14=%d", pads[3]); + fprintf(pp, " 15=%d", pads[2]); + } + + fprintf(pp, " 5=%d", pad_mode); + + if (op == "AveragePool") + { + int avgpool_count_include_pad = get_node_attr_i(node, "count_include_pad", 0); + fprintf(pp, " 6=%d", avgpool_count_include_pad); + } + } + else if (op == "BatchNormalization") + { + float epsilon = get_node_attr_f(node, "epsilon", 1e-5f); + + const onnx::TensorProto& scale = weights[node.input(1)]; + const onnx::TensorProto& B = weights[node.input(2)]; + const onnx::TensorProto& mean = weights[node.input(3)]; + const onnx::TensorProto& var = weights[node.input(4)]; + + int channels = get_tensor_proto_data_size(scale); + + fprintf(pp, " 0=%d", channels); + + fwrite_tensor_proto_data(scale, bp); + fwrite_tensor_proto_data(mean, bp); + // apply epsilon to var + { + const float* v = + var.has_raw_data() ? (const float*)var.raw_data().data() : var.float_data().data(); + + for (int j = 0; j < channels; j++) + { + float ve = v[j] + epsilon; + fwrite(&ve, sizeof(float), 1, bp); + } + } + fwrite_tensor_proto_data(B, bp); + } + else if (op == "BiasGelu") + { + const onnx::TensorProto& B = weights[node.input(1)]; + + fprintf(pp, " 0=%d", get_tensor_proto_data_size(B)); + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(B, bp); + } + else if (op == "Ceil") + { + int op_type = 3; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Clip") + { + float min; + float max; + if (node.input_size() == 1) + { + min = get_node_attr_f(node, "min", -FLT_MAX); + max = get_node_attr_f(node, "max", FLT_MAX); + } + else + { + min = weights.find(node.input(1)) != weights.end() ? get_node_attr_from_input(weights[node.input(1)]) : -FLT_MAX; + max = weights.find(node.input(2)) != weights.end() ? 
get_node_attr_from_input(weights[node.input(2)]) : FLT_MAX; + } + + fprintf(pp, " 0=%e", min); + fprintf(pp, " 1=%e", max); + } + else if (op == "Concat") + { + int axis = get_node_attr_i(node, "axis", 1); + fprintf(pp, " 0=%d", axis - 1); + } + else if (op == "Constant") + { + // never reach here + } + else if (op == "ConstantOfShape") + { + float value = 0.f; + value = get_node_attr_f(node, "value", 0.f); + fprintf(pp, " 0=%f", value); + } + else if (op == "Conv") + { + const onnx::TensorProto& W = weights[node.input(1)]; + + int num_filter = W.dims(0); + int has_bias = node.input_size() == 3 ? 1 : 0; + + std::string auto_pad = get_node_attr_s(node, "auto_pad"); + std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); + std::vector dilations = get_node_attr_ai(node, "dilations"); + std::vector strides = get_node_attr_ai(node, "strides"); + std::vector pads = get_node_attr_ai(node, "pads"); + int group = get_node_attr_i(node, "group", 1); + + fprintf(pp, " 0=%d", num_filter); + + if (kernel_shape.size() == 1) + { + fprintf(pp, " 1=%d", kernel_shape[0]); + } + else if (kernel_shape.size() == 2) + { + fprintf(pp, " 1=%d", kernel_shape[1]); + fprintf(pp, " 11=%d", kernel_shape[0]); + } + + if (dilations.size() == 1) + { + fprintf(pp, " 2=%d", dilations[0]); + } + else if (dilations.size() == 2) + { + fprintf(pp, " 2=%d", dilations[1]); + fprintf(pp, " 12=%d", dilations[0]); + } + + if (strides.size() == 1) + { + fprintf(pp, " 3=%d", strides[0]); + } + else if (strides.size() == 2) + { + fprintf(pp, " 3=%d", strides[1]); + fprintf(pp, " 13=%d", strides[0]); + } + + if (auto_pad == "SAME_UPPER") + { + fprintf(pp, " 4=-233"); + } + else if (auto_pad == "SAME_LOWER") + { + fprintf(pp, " 4=-234"); + } + else + { + if (pads.size() == 1) + { + fprintf(pp, " 4=%d", pads[0]); + } + else if (pads.size() == 2) + { + fprintf(pp, " 4=%d", pads[1]); + fprintf(pp, " 14=%d", pads[0]); + } + else if (pads.size() == 4) + { + fprintf(pp, " 4=%d", pads[1]); + fprintf(pp, " 14=%d", pads[0]); + fprintf(pp, " 15=%d", pads[3]); + fprintf(pp, " 16=%d", pads[2]); + } + } + + fprintf(pp, " 5=%d", has_bias); + + fprintf(pp, " 6=%d", get_tensor_proto_data_size(W)); + + if (group > 1) + { + fprintf(pp, " 7=%d", group); + } + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(W, bp); + + if (has_bias) + { + const onnx::TensorProto& B = weights[node.input(2)]; + fwrite_tensor_proto_data(B, bp); + } + } + else if (op == "ConvTranspose") + { + const onnx::TensorProto& W = weights[node.input(1)]; + + int has_bias = node.input_size() == 3 ? 
1 : 0; + + std::string auto_pad = get_node_attr_s(node, "auto_pad"); + std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); + std::vector dilations = get_node_attr_ai(node, "dilations"); + std::vector strides = get_node_attr_ai(node, "strides"); + std::vector output_padding = get_node_attr_ai(node, "output_padding"); + std::vector output_shape = get_node_attr_ai(node, "output_shape"); + std::vector pads = get_node_attr_ai(node, "pads"); + int group = get_node_attr_i(node, "group", 1); + int num_filter = W.dims(1) * group; + + fprintf(pp, " 0=%d", num_filter); + + if (kernel_shape.size() == 1) + { + fprintf(pp, " 1=%d", kernel_shape[0]); + } + else if (kernel_shape.size() == 2) + { + fprintf(pp, " 1=%d", kernel_shape[1]); + fprintf(pp, " 11=%d", kernel_shape[0]); + } + + if (dilations.size() == 1) + { + fprintf(pp, " 2=%d", dilations[0]); + } + else if (dilations.size() == 2) + { + fprintf(pp, " 2=%d", dilations[1]); + fprintf(pp, " 12=%d", dilations[0]); + } + + if (strides.size() == 1) + { + fprintf(pp, " 3=%d", strides[0]); + } + else if (strides.size() == 2) + { + fprintf(pp, " 3=%d", strides[1]); + fprintf(pp, " 13=%d", strides[0]); + } + + if (auto_pad == "SAME_UPPER") + { + fprintf(pp, " 4=-233"); + } + else if (auto_pad == "SAME_LOWER") + { + fprintf(pp, " 4=-234"); + } + else + { + if (pads.size() == 1) + { + fprintf(pp, " 4=%d", pads[0]); + } + else if (pads.size() == 2) + { + fprintf(pp, " 4=%d", pads[1]); + fprintf(pp, " 14=%d", pads[0]); + } + else if (pads.size() == 4) + { + fprintf(pp, " 4=%d", pads[1]); + fprintf(pp, " 14=%d", pads[0]); + fprintf(pp, " 15=%d", pads[3]); + fprintf(pp, " 16=%d", pads[2]); + } + } + + if (output_padding.size() == 1) + { + fprintf(pp, " 18=%d", output_padding[0]); + } + else if (output_padding.size() == 2) + { + fprintf(pp, " 18=%d", output_padding[1]); + fprintf(pp, " 19=%d", output_padding[0]); + } + + if (output_shape.size() == 1) + { + fprintf(pp, " 20=%d", output_shape[0]); + } + else if (output_shape.size() == 2) + { + fprintf(pp, " 20=%d", output_shape[1]); + fprintf(pp, " 21=%d", output_shape[0]); + } + + fprintf(pp, " 5=%d", has_bias); + + fprintf(pp, " 6=%d", get_tensor_proto_data_size(W)); + + if (group > 1) + { + fprintf(pp, " 7=%d", group); + } + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int maxk = 0; + if (kernel_shape.size() == 2) + { + maxk = kernel_shape[1] * kernel_shape[0]; + } + else + { + maxk = kernel_shape[0] * kernel_shape[0]; + } + int weight_data_size = get_tensor_proto_data_size(W); + const float* weight_data = 0; + if (W.has_raw_data()) + { + weight_data = (const float*)W.raw_data().data(); + } + else if (W.data_type() == 1) + { + weight_data = W.float_data().data(); + } + for (int g = 0; g < group; g++) + { + // reorder weight from inch-outch to outch-inch + int num_filter_g = num_filter / group; + int num_input = weight_data_size / maxk / num_filter_g / group; + const float* weight_data_ptr = weight_data + g * maxk * num_filter_g * num_input; + for (int k = 0; k < num_filter_g; k++) + { + for (int j = 0; j < num_input; j++) + { + fwrite(weight_data_ptr + (j * num_filter_g + k) * maxk, sizeof(float), maxk, bp); + } + } + } + + if (has_bias) + { + const onnx::TensorProto& B = weights[node.input(2)]; + fwrite_tensor_proto_data(B, bp); + } + } + else if (op == "Cos") + { + int op_type = 10; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Crop") + { + auto starts = get_node_attr_ai(node, "starts"); + fprintf(pp, " -23309=%zu", starts.size()); + for (size_t j = 0; j < 
starts.size(); ++j) + { + fprintf(pp, ",%i", starts[j]); + } + auto ends = get_node_attr_ai(node, "ends"); + fprintf(pp, " -23310=%zu", ends.size()); + for (size_t j = 0; j < ends.size(); ++j) + { + fprintf(pp, ",%i", ends[j]); + } + auto axis = get_node_attr_ai(node, "axis"); + fprintf(pp, " -23311=%zu", axis.size()); + for (size_t j = 0; j < axis.size(); ++j) + { + fprintf(pp, ",%i", axis[j]); + } + } + else if (op == "DepthToSpace") + { + // pixelshuffle + int scale_factor = get_node_attr_i(node, "blocksize", 1); + std::string mode = get_node_attr_s(node, "mode"); + fprintf(pp, " 0=%d", scale_factor); + if (mode == "CRD") + { + fprintf(pp, " 1=0"); + } + else if (mode == "DCR") + { + fprintf(pp, " 1=1"); + } + } + else if (op == "DetectionOutput") + { + float score_threshold = get_node_attr_f(node, "score_threshold"); + float nms_threshold = get_node_attr_f(node, "nms_threshold"); + int nms_top_k = get_node_attr_i(node, "nms_top_k"); + int keep_top_k = get_node_attr_i(node, "keep_top_k"); + int num_class = get_node_attr_i(node, "num_class"); + std::vector vars = get_node_attr_af(node, "vars"); + fprintf(pp, " 0=%d", num_class); + fprintf(pp, " 1=%f", nms_threshold); + fprintf(pp, " 2=%d", nms_top_k); + fprintf(pp, " 3=%d", keep_top_k); + fprintf(pp, " 4=%f", score_threshold); + fprintf(pp, " 5=%f", vars[0]); + fprintf(pp, " 6=%f", vars[1]); + fprintf(pp, " 7=%f", vars[2]); + fprintf(pp, " 8=%f", vars[3]); + } + else if (op == "Div") + { + int op_type = 3; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "Dropout") + { + // no-op + } + else if (op == "Elu") + { + float alpha = get_node_attr_f(node, "alpha", 1.f); + fprintf(pp, " 0=%e", alpha); + } + else if (op == "EmbedLayerNormalization") + { + const onnx::TensorProto& words = weights[node.input(2)]; + const onnx::TensorProto& positions = weights[node.input(3)]; + const onnx::TensorProto& W = weights[node.input(5)]; + const onnx::TensorProto& B = weights[node.input(6)]; + + fprintf(pp, " 0=%d", get_tensor_proto_data_size(B)); + fprintf(pp, " 1=%d", get_tensor_proto_data_size(words)); + fprintf(pp, " 2=%d", get_tensor_proto_data_size(positions)); + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(words, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(positions, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(W, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(B, bp); + } + else if (op == "Equal") + { + int op_type = 0; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Exp") + { + int op_type = 7; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Flatten") + { + int axis = get_node_attr_i(node, "axis", 1); + if (axis != 1) + { + fprintf(stderr, "Unsupported Flatten axis %d!\n", axis); + } + } + else if (op == "Floor") + { + int op_type = 2; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Gather") + { + if (weights[node.input(1)].dims_size() > 1) + { + fprintf(stderr, "Unsupported indice dims > 1"); + } + int axis = get_node_attr_i(node, "axis", 1) - 1; + if (axis < 0) + { + fprintf(stderr, "Unsupported Gather axis: %d\n", axis + 1); + } + fprintf(pp, " 0=%d", axis); + } + else if (op == "Gelu") + { + fprintf(pp, " 0=1"); + } + else if (op == "Gemm") + { + float alpha = 
get_node_attr_f(node, "alpha", 1.f); + float beta = get_node_attr_f(node, "beta", 1.f); + int transA = get_node_attr_i(node, "transA", 0); + int transB = get_node_attr_i(node, "transB", 0); + + if (alpha == 1.f && beta == 1.f && transA == 0 && transB == 1) + { + // InnerProduct-like A * B + C + const onnx::TensorProto& B = weights[node.input(1)]; + // B has transposed. + int num_output = B.dims(0); + fprintf(pp, " 0=%d", num_output); + if (node.input_size() == 3) + { + fprintf(pp, " 1=1"); + } + else + { + fprintf(pp, " 1=0"); + } + fprintf(pp, " 2=%d", get_tensor_proto_data_size(B)); + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + fwrite_tensor_proto_data(B, bp); + if (node.input_size() == 3) + { + const onnx::TensorProto& C = weights[node.input(2)]; + fwrite_tensor_proto_data(C, bp); + } + } + else + { + // gemm + fprintf(pp, " 0=%e", alpha); + fprintf(pp, " 1=%e", beta); + fprintf(pp, " 2=%d", transA); + fprintf(pp, " 3=%d", transB); + } + } + else if (op == "GlobalAveragePool") + { + int pool = 1; + int global_pool = 1; + + fprintf(pp, " 0=%d", pool); + fprintf(pp, " 4=%d", global_pool); + } + else if (op == "GlobalMaxPool") + { + int pool = 0; + int global_pool = 1; + + fprintf(pp, " 0=%d", pool); + fprintf(pp, " 4=%d", global_pool); + } + else if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d" || + op == "adaptive_max_pool2d") + { + int pool = 0; + if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d") + { + pool = 1; + } + int adaptive_pooling = 1; + const onnx::TensorProto& out_shape_tp = weights[node.input(1)]; + std::vector out_shape = get_node_attr_from_input_ai(out_shape_tp); + + fprintf(pp, " 0=%d", pool); + fprintf(pp, " 7=%d", adaptive_pooling); + if (out_shape.size() == 1) + { + fprintf(pp, " 8=%d", out_shape[0]); + } + else if (out_shape.size() == 2) + { + // out_w + fprintf(pp, " 8=%d", out_shape[1]); + // out_h + fprintf(pp, " 18=%d", out_shape[0]); + } + } + else if (op == "GroupNorm") + { + int groups = get_node_attr_i(node, "groups", 1); + int channels = get_node_attr_i(node, "channels", 1); + float eps = get_node_attr_f(node, "epsilon", 1e-5f); + int affine = get_node_attr_i(node, "affine", 1); + + if (affine) + { + // discard affine-less S=1 B=0 + std::vector affine_S = get_node_attr_from_input_af(weights[node.input(1)]); + std::vector affine_B = get_node_attr_from_input_af(weights[node.input(2)]); + if (affine_S.size() == 1 && affine_S[0] == 1.f && affine_B.size() == 1 && + affine_B[0] == 0.f) + { + affine = 0; + } + else + { + affine = 0; + { + for (int j = 0; j < channels; j++) + { + if (affine_S[j] != 1.f || affine_B[j] != 0.f) + { + affine = 1; + break; + } + } + } + } + } + + fprintf(pp, " 0=%d", groups); + fprintf(pp, " 1=%d", channels); + fprintf(pp, " 2=%e", eps); + fprintf(pp, " 3=%d", affine); + if (affine) + { + const onnx::TensorProto& scale = weights[node.input(1)]; + const onnx::TensorProto& B = weights[node.input(2)]; + + fwrite_tensor_proto_data(scale, bp); + fwrite_tensor_proto_data(B, bp); + } + } + else if (op == "GRU") + { + const onnx::TensorProto& W = weights[node.input(1)]; + const onnx::TensorProto& R = weights[node.input(2)]; + const onnx::TensorProto& B = weights[node.input(3)]; + + int hidden_size = get_node_attr_i(node, "hidden_size", 0); + std::string direction = get_node_attr_s(node, "direction"); + + int direction_type = 0; + if (direction == "forward") + { + direction_type = 0; + } + else if (direction == "reverse") + { + direction_type = 1; + } + else if (direction == "bidirectional") + { + 
direction_type = 2; + } + + int weight_data_size = get_tensor_proto_data_size(W); + + fprintf(pp, " 0=%d", hidden_size); + fprintf(pp, " 1=%d", weight_data_size); + fprintf(pp, " 2=%d", direction_type); + + int num_directions = direction_type == 2 ? 2 : 1; + + int quantize_tag = 0; + + // reorder num_directions-URN-hidden-size to + // num_directions-RUN-hidden-size + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int weight_data_size_g = get_tensor_proto_data_size(W) / 3 / num_directions; + const float* wptr = + W.has_raw_data() ? (const float*)W.raw_data().data() : W.float_data().data(); + + const float* uptr = wptr; + const float* rptr = wptr + weight_data_size_g; + const float* nptr = wptr + weight_data_size_g * 2; + fwrite(rptr, sizeof(float), weight_data_size_g, bp); + fwrite(uptr, sizeof(float), weight_data_size_g, bp); + fwrite(nptr, sizeof(float), weight_data_size_g, bp); + + if (direction_type == 2) + { + uptr += weight_data_size_g * 3; + rptr += weight_data_size_g * 3; + nptr += weight_data_size_g * 3; + fwrite(rptr, sizeof(float), weight_data_size_g, bp); + fwrite(uptr, sizeof(float), weight_data_size_g, bp); + fwrite(nptr, sizeof(float), weight_data_size_g, bp); + } + } + + // reduce U and R bias except N + // reorder num_directions-URN-hidden to num_directions-RUN-hidden + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int bias_data_size_g = get_tensor_proto_data_size(B) / 2 / 3 / num_directions; + const float* bptr = + B.has_raw_data() ? (const float*)B.raw_data().data() : B.float_data().data(); + const float* wuptr = bptr; + const float* wrptr = bptr + bias_data_size_g; + const float* wnptr = bptr + bias_data_size_g * 2; + const float* buptr = bptr + bias_data_size_g * 3; + const float* brptr = bptr + bias_data_size_g * 4; + const float* bnptr = bptr + bias_data_size_g * 5; + + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = wrptr[j] + brptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = wuptr[j] + buptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + fwrite(wnptr, sizeof(float), bias_data_size_g, bp); + fwrite(bnptr, sizeof(float), bias_data_size_g, bp); + + if (direction_type == 2) + { + wuptr += bias_data_size_g * 6; + wrptr += bias_data_size_g * 6; + wnptr += bias_data_size_g * 6; + buptr += bias_data_size_g * 6; + brptr += bias_data_size_g * 6; + bnptr += bias_data_size_g * 6; + + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = wrptr[j] + brptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = wuptr[j] + buptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + fwrite(wnptr, sizeof(float), bias_data_size_g, bp); + fwrite(bnptr, sizeof(float), bias_data_size_g, bp); + } + } + + // reorder num_directions-URN-hidden-hidden to + // num_directions-RUN-hidden-hidden + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int weight_data_size_g = get_tensor_proto_data_size(R) / 3 / num_directions; + const float* Rptr = + R.has_raw_data() ? 
(const float*)R.raw_data().data() : R.float_data().data(); + + const float* uptr = Rptr; + const float* rptr = Rptr + weight_data_size_g; + const float* nptr = Rptr + weight_data_size_g * 2; + fwrite(rptr, sizeof(float), weight_data_size_g, bp); + fwrite(uptr, sizeof(float), weight_data_size_g, bp); + fwrite(nptr, sizeof(float), weight_data_size_g, bp); + + if (direction_type == 2) + { + uptr += weight_data_size_g * 3; + rptr += weight_data_size_g * 3; + nptr += weight_data_size_g * 3; + fwrite(rptr, sizeof(float), weight_data_size_g, bp); + fwrite(uptr, sizeof(float), weight_data_size_g, bp); + fwrite(nptr, sizeof(float), weight_data_size_g, bp); + } + } + } + else if (op == "HardSigmoid") + { + float alpha = get_node_attr_f(node, "alpha", 0.2f); + float beta = get_node_attr_f(node, "beta", 0.5f); + + fprintf(pp, " 0=%e", alpha); + fprintf(pp, " 1=%e", beta); + } + else if (op == "HardSwish") + { + float alpha = get_node_attr_f(node, "alpha", 0.2f); + float beta = get_node_attr_f(node, "beta", 0.5f); + + fprintf(pp, " 0=%e", alpha); + fprintf(pp, " 1=%e", beta); + } + else if (op == "ImageScaler") + { + std::vector<float> bias = get_node_attr_af(node, "bias"); + float scale = get_node_attr_f(node, "scale", 1.f); + + int channels = (int)bias.size(); + + fprintf(pp, " 0=%d", channels); + fprintf(pp, " 1=1"); + + for (int j = 0; j < channels; j++) + { + fwrite(&scale, sizeof(float), 1, bp); + } + fwrite(&bias[0], sizeof(float), channels, bp); + } + else if (op == "InstanceNormalization") + { + float eps = get_node_attr_f(node, "epsilon", 1e-5f); + + // discard affine-less S=1 B=0 + std::vector<float> affine_S = get_node_attr_from_input_af(weights[node.input(1)]); + std::vector<float> affine_B = get_node_attr_from_input_af(weights[node.input(2)]); + int channels = (int)affine_S.size(); + int affine = 0; + { + for (int j = 0; j < channels; j++) + { + if (affine_S[j] != 1.f || affine_B[j] != 0.f) + { + affine = 1; + break; + } + } + } + + fprintf(pp, " 0=%d", channels); + fprintf(pp, " 1=%e", eps); + fprintf(pp, " 2=%d", affine); + if (affine) + { + const onnx::TensorProto& scale = weights[node.input(1)]; + const onnx::TensorProto& B = weights[node.input(2)]; + + fwrite_tensor_proto_data(scale, bp); + fwrite_tensor_proto_data(B, bp); + } + } + else if (op == "LayerNorm") + { + float eps = get_node_attr_f(node, "epsilon", 1e-5f); + int affine = get_node_attr_i(node, "affine", 1); + + if (affine) + { + // discard affine-less S=1 B=0 + std::vector<float> affine_S = get_node_attr_from_input_af(weights[node.input(1)]); + std::vector<float> affine_B = get_node_attr_from_input_af(weights[node.input(2)]); + int affine_size = (int)affine_S.size(); + affine = 0; + { + for (int j = 0; j < affine_size; j++) + { + if (affine_S[j] != 1.f || affine_B[j] != 0.f) + { + affine = 1; + break; + } + } + } + + if (affine) + { + fprintf(pp, " 0=%d", affine_size); + } + } + + fprintf(pp, " 1=%e", eps); + fprintf(pp, " 2=%d", affine); + + if (affine) + { + const onnx::TensorProto& scale = weights[node.input(1)]; + const onnx::TensorProto& B = weights[node.input(2)]; + + fwrite_tensor_proto_data(scale, bp); + fwrite_tensor_proto_data(B, bp); + } + } + else if (op == "LeakyRelu") + { + float alpha = get_node_attr_f(node, "alpha", 0.01f); + fprintf(pp, " 0=%e", alpha); + } + else if (op == "Threshold") + { + float threshold = get_node_attr_f(node, "threshold", 0.f); + fprintf(pp, " 0=%e", threshold); + } + else if (op == "Log") + { + int op_type = 8; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "LRN") + { + float alpha = get_node_attr_f(node, 
"alpha", 1.f); + float beta = get_node_attr_f(node, "beta", 0.5f); + float bias = get_node_attr_f(node, "bias", 1.f); + int size = get_node_attr_i(node, "size", 1); + + int norm_region = 0; + + fprintf(pp, " 0=%d", norm_region); + fprintf(pp, " 1=%d", size); + fprintf(pp, " 2=%e", alpha); + fprintf(pp, " 3=%e", beta); + fprintf(pp, " 4=%e", bias); + } + else if (op == "LSTM") + { + const onnx::TensorProto& W = weights[node.input(1)]; + const onnx::TensorProto& R = weights[node.input(2)]; + const onnx::TensorProto& B = weights[node.input(3)]; + + int hidden_size = get_node_attr_i(node, "hidden_size", 0); + std::string direction = get_node_attr_s(node, "direction"); + + int direction_type = 0; + if (direction == "forward") + { + direction_type = 0; + } + else if (direction == "reverse") + { + direction_type = 1; + } + else if (direction == "bidirectional") + { + direction_type = 2; + } + + int weight_data_size = get_tensor_proto_data_size(W); + + fprintf(pp, " 0=%d", hidden_size); + fprintf(pp, " 1=%d", weight_data_size); + fprintf(pp, " 2=%d", direction_type); + + int num_directions = direction_type == 2 ? 2 : 1; + + int quantize_tag = 0; + + // reorder num_directions-IOFG-hidden-size to + // num_directions-IFOG-hidden-size + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int weight_data_size_g = get_tensor_proto_data_size(W) / 4 / num_directions; + const float* wptr = + W.has_raw_data() ? (const float*)W.raw_data().data() : W.float_data().data(); + + const float* iptr = wptr; + const float* optr = wptr + weight_data_size_g; + const float* fptr = wptr + weight_data_size_g * 2; + const float* gptr = wptr + weight_data_size_g * 3; + fwrite(iptr, sizeof(float), weight_data_size_g, bp); + fwrite(fptr, sizeof(float), weight_data_size_g, bp); + fwrite(optr, sizeof(float), weight_data_size_g, bp); + fwrite(gptr, sizeof(float), weight_data_size_g, bp); + + if (direction_type == 2) + { + iptr += weight_data_size_g * 4; + optr += weight_data_size_g * 4; + fptr += weight_data_size_g * 4; + gptr += weight_data_size_g * 4; + fwrite(iptr, sizeof(float), weight_data_size_g, bp); + fwrite(fptr, sizeof(float), weight_data_size_g, bp); + fwrite(optr, sizeof(float), weight_data_size_g, bp); + fwrite(gptr, sizeof(float), weight_data_size_g, bp); + } + } + + // reduce xc and hc bias + // reorder num_directions-IOFG-hidden to num_directions-IFOG-hidden + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int bias_data_size_g = get_tensor_proto_data_size(B) / 2 / 4 / num_directions; + const float* xcbptr = + B.has_raw_data() ? 
(const float*)B.raw_data().data() : B.float_data().data(); + const float* xiptr = xcbptr; + const float* xoptr = xcbptr + bias_data_size_g; + const float* xfptr = xcbptr + bias_data_size_g * 2; + const float* xgptr = xcbptr + bias_data_size_g * 3; + const float* hiptr = xcbptr + bias_data_size_g * 4; + const float* hoptr = xcbptr + bias_data_size_g * 5; + const float* hfptr = xcbptr + bias_data_size_g * 6; + const float* hgptr = xcbptr + bias_data_size_g * 7; + + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xiptr[j] + hiptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xfptr[j] + hfptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xoptr[j] + hoptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xgptr[j] + hgptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + + if (direction_type == 2) + { + xiptr += bias_data_size_g * 8; + xoptr += bias_data_size_g * 8; + xfptr += bias_data_size_g * 8; + xgptr += bias_data_size_g * 8; + hiptr += bias_data_size_g * 8; + hoptr += bias_data_size_g * 8; + hfptr += bias_data_size_g * 8; + hgptr += bias_data_size_g * 8; + + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xiptr[j] + hiptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xfptr[j] + hfptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xoptr[j] + hoptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xgptr[j] + hgptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + + // reorder num_directions-IOFG-hidden-hidden to + // num_directions-IFOG-hidden-hidden + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int weight_data_size_g = get_tensor_proto_data_size(R) / 4 / num_directions; + const float* rptr = + R.has_raw_data() ? (const float*)R.raw_data().data() : R.float_data().data(); + + const float* iptr = rptr; + const float* optr = rptr + weight_data_size_g; + const float* fptr = rptr + weight_data_size_g * 2; + const float* gptr = rptr + weight_data_size_g * 3; + fwrite(iptr, sizeof(float), weight_data_size_g, bp); + fwrite(fptr, sizeof(float), weight_data_size_g, bp); + fwrite(optr, sizeof(float), weight_data_size_g, bp); + fwrite(gptr, sizeof(float), weight_data_size_g, bp); + + if (direction_type == 2) + { + iptr += weight_data_size_g * 4; + optr += weight_data_size_g * 4; + fptr += weight_data_size_g * 4; + gptr += weight_data_size_g * 4; + fwrite(iptr, sizeof(float), weight_data_size_g, bp); + fwrite(fptr, sizeof(float), weight_data_size_g, bp); + fwrite(optr, sizeof(float), weight_data_size_g, bp); + fwrite(gptr, sizeof(float), weight_data_size_g, bp); + } + } + } + else if (op == "MatMul") + { + if (weights.find(node.input(1)) != weights.end() && weights[node.input(1)].dims_size() == 2) + { + // InnerProduct + const onnx::TensorProto& B = weights[node.input(1)]; + + int weight_data_size = get_tensor_proto_data_size(B); + + int num_output = B.dims(B.dims_size() - 1); + int num_input = weight_data_size / num_output; + + fprintf(pp, " 0=%d", num_output); + fprintf(pp, " 1=0"); + fprintf(pp, " 2=%d", weight_data_size); + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + + // reorder num_input-num_output to num_output-num_input + { + const float* bptr = + B.has_raw_data() ? 
(const float*)B.raw_data().data() : B.float_data().data(); + + for (int j = 0; j < num_output; j++) + { + for (int k = 0; k < num_input; k++) + { + float vb = bptr[k * num_output + j]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + + // fwrite_tensor_proto_data(B, bp) + } + else + { + // default matrix multiplication + } + } + else if (op == "Max") + { + int op_type = 4; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "Min") + { + int op_type = 5; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "Mul") + { + int op_type = 2; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "MultiHeadAttention") + { + int embed_dim = get_node_attr_i(node, "embed_dim", 0); + int num_heads = get_node_attr_i(node, "num_heads", 0); + + fprintf(pp, " 0=%d", embed_dim); + fprintf(pp, " 1=%d", num_heads); + + if (node.input_size() == 5) + { + const onnx::TensorProto& qkvw = weights[node.input(1)]; + const onnx::TensorProto& qkvb = weights[node.input(2)]; + const onnx::TensorProto& ow = weights[node.input(3)]; + const onnx::TensorProto& ob = weights[node.input(4)]; + + int weight_data_size = get_tensor_proto_data_size(ow); + + fprintf(pp, " 2=%d", weight_data_size); + + int quantize_tag = 0; + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose qw + { + const float* wptr = + qkvw.has_raw_data() ? (const float*)qkvw.raw_data().data() : qkvw.float_data().data(); + const float* bptr = + qkvb.has_raw_data() ? (const float*)qkvb.raw_data().data() : qkvb.float_data().data(); + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim * 3 + k]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + + fwrite(bptr, sizeof(float), embed_dim, bp); + } + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose kw + { + const float* wptr = + qkvw.has_raw_data() ? (const float*)qkvw.raw_data().data() : qkvw.float_data().data(); + const float* bptr = + qkvb.has_raw_data() ? (const float*)qkvb.raw_data().data() : qkvb.float_data().data(); + bptr += embed_dim; + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim * 3 + k + embed_dim]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + + fwrite(bptr, sizeof(float), embed_dim, bp); + } + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose vw + { + const float* wptr = + qkvw.has_raw_data() ? (const float*)qkvw.raw_data().data() : qkvw.float_data().data(); + const float* bptr = + qkvb.has_raw_data() ? (const float*)qkvb.raw_data().data() : qkvb.float_data().data(); + bptr += embed_dim * 2; + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim * 3 + k + embed_dim * 2]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + + fwrite(bptr, sizeof(float), embed_dim, bp); + } + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose ow + { + const float* wptr = + ow.has_raw_data() ? 
(const float*)ow.raw_data().data() : ow.float_data().data(); + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim + k]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + fwrite_tensor_proto_data(ob, bp); + } + else + { + const onnx::TensorProto& qw = weights[node.input(3)]; + const onnx::TensorProto& qb = weights[node.input(4)]; + const onnx::TensorProto& kw = weights[node.input(5)]; + const onnx::TensorProto& kb = weights[node.input(6)]; + const onnx::TensorProto& vw = weights[node.input(7)]; + const onnx::TensorProto& vb = weights[node.input(8)]; + const onnx::TensorProto& ow = weights[node.input(9)]; + const onnx::TensorProto& ob = weights[node.input(10)]; + + int weight_data_size = get_tensor_proto_data_size(qw); + + fprintf(pp, " 2=%d", weight_data_size); + + int quantize_tag = 0; + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose qw + { + const float* wptr = + qw.has_raw_data() ? (const float*)qw.raw_data().data() : qw.float_data().data(); + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim + k]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + fwrite_tensor_proto_data(qb, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose kw + { + const float* wptr = + kw.has_raw_data() ? (const float*)kw.raw_data().data() : kw.float_data().data(); + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim + k]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + fwrite_tensor_proto_data(kb, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose vw + { + const float* wptr = + vw.has_raw_data() ? (const float*)vw.raw_data().data() : vw.float_data().data(); + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim + k]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + fwrite_tensor_proto_data(vb, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose ow + { + const float* wptr = + ow.has_raw_data() ? 
(const float*)ow.raw_data().data() : ow.float_data().data(); + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim + k]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + fwrite_tensor_proto_data(ob, bp); + } + } + else if (op == "Neg") + { + int op_type = 1; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "NonMaxSuppression") + { + int max_dets = 0; + float iou_thre = 0.f; + float score_thre = 0.f; + // fprintf(stderr, "%s\n", node.name().c_str()); + // fprintf(stderr, "node.input_size(): %d\n", node.input_size()); + if (node.input_size() >= 3) + { + // fprintf(stderr, "ok12!\n"); + max_dets = (int)(get_node_attr_from_input(weights[node.input(2)]) + 0.5); + } + if (node.input_size() >= 4) + { + // fprintf(stderr, "iou_thre: %f\n", + // get_node_attr_from_input(weights[node.input(3)])); + iou_thre = get_node_attr_from_input(weights[node.input(3)]); + } + if (node.input_size() >= 5) + { + // fprintf(stderr, "score_thre: %f\n", + // get_node_attr_from_input(weights[node.input(4)])); + score_thre = get_node_attr_from_input(weights[node.input(4)]); + } + fprintf(pp, " 0=%d", max_dets); + fprintf(pp, " 1=%f", iou_thre); + fprintf(pp, " 2=%f", score_thre); + } + else if (op == "Normalize") + { + float eps = get_node_attr_f(node, "eps", 0.f); + int scale_data_size = 1; + + fprintf(pp, " 1=1"); // channel_shared + fprintf(pp, " 2=%e", eps); + fprintf(pp, " 3=%d", scale_data_size); + fprintf(pp, " 9=1"); // TODO hardcode pytorch style + + const float scale_data[1] = {1.f}; + fwrite(scale_data, sizeof(float), 1, bp); + } + else if (op == "Pad") + { + std::string mode = get_node_attr_s(node, "mode"); + float value = get_node_attr_f(node, "value", 0.f); + + std::vector pads; + if (node.input_size() == 1) + { + pads = get_node_attr_ai(node, "pads"); + } + else + { + pads = get_node_attr_from_input_ai(weights[node.input(1)]); + } + int type = 0; + if (mode == "constant") + { + type = 0; + } + else if (mode == "edge") + { + type = 1; + } + else if (mode == "reflect") + { + type = 2; + } + + int pad_size = (int)pads.size(); + int top = 0; + int bottom = 0; + int left = 0; + int right = 0; + int front = 0; + int behind = 0; + if (pad_size == 8) + { + // NCHW + top = pads[2]; + bottom = pads[6]; + left = pads[3]; + right = pads[7]; + front = pads[1]; + behind = pads[5]; + } + else if (pad_size == 6) + { + // NHW + top = pads[1]; + bottom = pads[4]; + left = pads[2]; + right = pads[5]; + } + else + { + // NW + left = pads[1]; + right = pads[3]; + } + + fprintf(pp, " 0=%d", top); + fprintf(pp, " 1=%d", bottom); + fprintf(pp, " 2=%d", left); + fprintf(pp, " 3=%d", right); + fprintf(pp, " 4=%d", type); + fprintf(pp, " 5=%e", value); + fprintf(pp, " 7=%d", front); + fprintf(pp, " 8=%d", behind); + } + else if (op == "Pow") + { + int op_type = 6; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "PriorBox") + { + std::vector min_sizes = get_node_attr_af(node, "min_sizes"); + std::vector max_sizes = get_node_attr_af(node, "max_sizes"); + std::vector aspect_ratios = get_node_attr_af(node, "aspect_ratios"); + fprintf(pp, " -23300=%zu", min_sizes.size()); + for (size_t j = 0; j < min_sizes.size(); ++j) + { + fprintf(pp, ",%f", min_sizes[j]); + } + fprintf(pp, " -23301=%zu", max_sizes.size()); + for (size_t j = 0; j < max_sizes.size(); ++j) + { + 
fprintf(pp, ",%f", max_sizes[j]); + } + fprintf(pp, " -23302=%zu", aspect_ratios.size()); + for (size_t j = 0; j < aspect_ratios.size(); ++j) + { + fprintf(pp, ",%f", aspect_ratios[j]); + } + int image_width = get_node_attr_i(node, "image_width"); + int image_height = get_node_attr_i(node, "image_height"); + float step_width = get_node_attr_f(node, "step_width"); + float step_height = get_node_attr_f(node, "step_height"); + float offset = get_node_attr_f(node, "offset"); + int step_mmdetection = get_node_attr_i(node, "step_mmdetection"); + fprintf(pp, " 9=%d", image_width); + fprintf(pp, " 10=%d", image_height); + fprintf(pp, " 11=%f", step_width); + fprintf(pp, " 12=%f", step_height); + fprintf(pp, " 13=%f", offset); + fprintf(pp, " 14=%d", step_mmdetection); + } + else if (op == "PixelShuffle") + { + int scale_factor = get_node_attr_i(node, "scale_factor", 1); + fprintf(pp, " 0=%d", scale_factor); + } + else if (op == "PRelu") + { + const onnx::TensorProto& slope = weights[node.input(1)]; + + int num_slope = get_tensor_proto_data_size(slope); + + fprintf(pp, " 0=%d", num_slope); + + fwrite_tensor_proto_data(slope, bp); + } + else if (op == "Reciprocal") + { + int op_type = 15; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "ReduceMax" || op == "ReduceMin" || op == "ReduceMean" || op == "ReduceProd" || + op == "ReduceSum" || op == "ReduceSumSquare" || op == "ReduceL1" || + op == "ReduceL2" || op == "ReduceLogSum" || op == "ReduceLogSumExp") + { + int op_type = -233; + if (op == "ReduceSum") + op_type = 0; + else if (op == "ReduceSumSquare") + op_type = 2; + else if (op == "ReduceMean") + op_type = 3; + else if (op == "ReduceMax") + op_type = 4; + else if (op == "ReduceMin") + op_type = 5; + else if (op == "ReduceProd") + op_type = 6; + else if (op == "ReduceL1") + op_type = 7; + else if (op == "ReduceL2") + op_type = 8; + else if (op == "ReduceLogSum") + op_type = 9; + else if (op == "ReduceLogSumExp") + op_type = 10; + fprintf(pp, " 0=%d", op_type); + + std::vector axes = get_node_attr_ai(node, "axes"); + int keepdims = get_node_attr_i(node, "keepdims", 1); + + if (axes.size() > 0) + { + // if axes set, reduce according to axes + fprintf(pp, " 1=%d", 0); + fprintf(pp, " -23303=%zu", axes.size()); + for (size_t j = 0; j < axes.size(); j++) + { + if (axes[j] == 0 || axes[j] > 4 || axes[j] < -3) + fprintf(stderr, "Unsupported reduction axes !\n"); + fprintf(pp, ",%d", axes[j] > 0 ? axes[j] - 1 : axes[j]); + } + } + else + { + // if axes not set, reduce all axes by default + fprintf(pp, " 1=%d", 1); + } + fprintf(pp, " 4=%d", keepdims); + fprintf(pp, " 5=1"); + } + else if (op == "Reorg") + { + int stride = get_node_attr_i(node, "stride", 1); + fprintf(pp, " 0=%d", stride); + } + else if (op == "Reshape") + { + std::vector shape; + + if (node.input_size() == 1) + { + shape = get_node_attr_ai(node, "shape"); + } + else if (weights.find(node.input(1)) != weights.end()) + { + shape = get_node_attr_from_input_ai(weights[node.input(1)]); + } + else + { + fprintf(stderr, "Unsupported reshape weight ! 
\n"); + } + + if (shape.size() == 1) + { + fprintf(pp, " 0=%d", shape[0]); // should never reach here + } + else if (shape.size() == 2) + { + fprintf(pp, " 0=%d", shape[1]); + } + else if (shape.size() == 3) + { + fprintf(pp, " 0=%d", shape[2]); + fprintf(pp, " 1=%d", shape[1]); + } + else if (shape.size() == 4) + { + fprintf(pp, " 0=%d", shape[3]); + fprintf(pp, " 1=%d", shape[2]); + fprintf(pp, " 2=%d", shape[1]); + } + else if (shape.size() == 5) + { + fprintf(pp, " 0=%d", shape[4] * shape[3]); + fprintf(pp, " 1=%d", shape[2]); + fprintf(pp, " 2=%d", shape[1]); + } + } + else if (op == "Resize") + { + std::string mode = get_node_attr_s(node, "mode"); + std::string align = get_node_attr_s(node, "coordinate_transformation_mode"); + + std::vector scales; + std::vector sizes; + if (node.input_size() == 2) + { + // opset 10 + scales = get_node_attr_from_input_af(weights[node.input(1)]); + } + else + { + // opset 11+ + scales = get_node_attr_from_input_af(weights[node.input(2)]); + if (node.input_size() >= 4) + { + sizes = get_node_attr_from_input_ai(weights[node.input(3)]); + } + } + + int resize_type = 1; + if (mode == "nearest") + { + resize_type = 1; + } + else if (mode == "linear") + { + resize_type = 2; + } + else if (mode == "cubic") + { + resize_type = 3; + } + + if (scales.empty() && sizes.empty()) + { + fprintf(stderr, "Unsupported Resize scales and sizes are all empty!\n"); + } + + float h_scale = 1.f; + float w_scale = 1.f; + if (scales.size() == 2) + { + w_scale = scales[1]; + } + else if (scales.size() == 3) + { + h_scale = scales[1]; + w_scale = scales[2]; + } + else if (scales.size() == 4) + { + h_scale = scales[2]; + w_scale = scales[3]; + + if (scales[1] != 1.f) fprintf(stderr, "Unsupported Resize scales !\n"); + } + + int output_height = 0; + int output_width = 0; + if (sizes.size() == 2) + { + output_width = sizes[1]; + } + else if (sizes.size() == 3) + { + output_height = sizes[1]; + output_width = sizes[2]; + } + else if (sizes.size() == 4) + { + output_height = sizes[2]; + output_width = sizes[3]; + } + + int align_corner = 0; + if (align == "align_corners") + { + align_corner = 1; + } + + fprintf(pp, " 0=%d", resize_type); + fprintf(pp, " 1=%e", h_scale); + fprintf(pp, " 2=%e", w_scale); + fprintf(pp, " 3=%d", output_height); + fprintf(pp, " 4=%d", output_width); + fprintf(pp, " 6=%d", align_corner); + } + else if (op == "RNN") + { + const onnx::TensorProto& W = weights[node.input(1)]; + const onnx::TensorProto& R = weights[node.input(2)]; + const onnx::TensorProto& B = weights[node.input(3)]; + + int hidden_size = get_node_attr_i(node, "hidden_size", 0); + std::string direction = get_node_attr_s(node, "direction"); + + int direction_type = 0; + if (direction == "forward") + { + direction_type = 0; + } + else if (direction == "reverse") + { + direction_type = 1; + } + else if (direction == "bidirectional") + { + direction_type = 2; + } + + int weight_data_size = get_tensor_proto_data_size(W); + + fprintf(pp, " 0=%d", hidden_size); + fprintf(pp, " 1=%d", weight_data_size); + fprintf(pp, " 2=%d", direction_type); + + int num_directions = direction_type == 2 ? 2 : 1; + + int quantize_tag = 0; + + fwrite(&quantize_tag, sizeof(int), 1, bp); + fwrite_tensor_proto_data(W, bp); + + // reduce xc and hc bias + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int bias_data_size_g = get_tensor_proto_data_size(B) / 2 / num_directions; + const float* bptr = + B.has_raw_data() ? 
(const float*)B.raw_data().data() : B.float_data().data(); + const float* xiptr = bptr; + const float* hiptr = bptr + bias_data_size_g; + + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xiptr[j] + hiptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + + if (direction_type == 2) + { + xiptr += bias_data_size_g * 2; + hiptr += bias_data_size_g * 2; + + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xiptr[j] + hiptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + + fwrite(&quantize_tag, sizeof(int), 1, bp); + fwrite_tensor_proto_data(R, bp); + } + else if (op == "RDiv") + { + int op_type = 8; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "RSub") + { + int op_type = 7; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "RoiAlign") + { + int pooled_width = get_node_attr_i(node, "output_width", 1); + int pooled_height = get_node_attr_i(node, "output_height", 1); + float spatial_scale = get_node_attr_f(node, "spatial_scale", 1.f); + int sampling_ratio = get_node_attr_i(node, "sampling_ratio", 0); + fprintf(pp, " 0=%d", pooled_width); + fprintf(pp, " 1=%d", pooled_height); + fprintf(pp, " 2=%f", spatial_scale); + fprintf(pp, " 3=%d", sampling_ratio); + } + else if (op == "ShuffleChannel") + { + int group = get_node_attr_i(node, "group", 1); + int reverse = get_node_attr_i(node, "reverse", 0); + fprintf(pp, " 0=%d", group); + fprintf(pp, " 1=%d", reverse); + } + else if (op == "Sigmoid") + { + // no param + } + else if (op == "Sin") + { + int op_type = 9; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "SkipLayerNormalization") + { + const onnx::TensorProto& W = weights[node.input(2)]; + const onnx::TensorProto& B = weights[node.input(3)]; + const onnx::TensorProto& B2 = weights[node.input(4)]; + + fprintf(pp, " 0=%d", get_tensor_proto_data_size(B)); + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(W, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(B, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(B2, bp); + } + else if (op == "Slice") + { + bool use_crop = true; + + std::vector starts; + std::vector ends; + std::vector axes; + std::vector steps; + if (node.input_size() == 1) + { + starts = get_node_attr_ai(node, "starts"); + ends = get_node_attr_ai(node, "ends"); + axes = get_node_attr_ai(node, "axes"); + steps = get_node_attr_ai(node, "steps"); // TODO + } + else + { + starts = get_node_attr_from_input_ai(weights[node.input(1)]); + ends = get_node_attr_from_input_ai(weights[node.input(2)]); + if (node.input_size() >= 4) axes = get_node_attr_from_input_ai(weights[node.input(3)]); + if (node.input_size() >= 5) steps = get_node_attr_from_input_ai(weights[node.input(4)]); + } + + // assert step == 1 or step >= ends + for (int i = 0; i < (int)steps.size(); i++) + { + if (steps[i] != 1 && steps[i] < ends[i]) + { + use_crop = false; + fprintf(stderr, "Unsupported slice step ! 
Use custom TensorSlice\n"); + } + } + + if (use_crop) + { + // filter out N-dim axis + if (!axes.empty()) + { + for (int i = 0; i < (int)axes.size(); i++) + { + int axis = axes[i]; + if (axis == 0) + { + starts.erase(starts.begin() + i); + ends.erase(ends.begin() + i); + axes.erase(axes.begin() + i); + break; + } + } + } + + fprintf(pp, " -23309=%d", (int)starts.size()); + for (int i = 0; i < (int)starts.size(); i++) + { + fprintf(pp, ",%d", starts[i]); + } + fprintf(pp, " -23310=%d", (int)ends.size()); + for (int i = 0; i < (int)ends.size(); i++) + { + fprintf(pp, ",%d", ends[i]); + } + if (!axes.empty()) + { + fprintf(pp, " -23311=%d", (int)axes.size()); + for (int i = 0; i < (int)axes.size(); i++) + { + int axis = axes[i]; + if (axis == 0 || axis > 3 || axis < -3) fprintf(stderr, "Unsupported slice axes !\n"); + + if (axis > 0) axis = axis - 1; // -1 for skip N-dim + + fprintf(pp, ",%d", axis); + } + } + } + else + { + fprintf(pp, " -23300=%d", (int)starts.size()); + for (int i = 0; i < (int)starts.size(); i++) + { + fprintf(pp, ",%d", starts[i]); + } + fprintf(pp, " -23301=%d", (int)ends.size()); + for (int i = 0; i < (int)ends.size(); i++) + { + fprintf(pp, ",%d", ends[i]); + } + if (!axes.empty()) + { + fprintf(pp, " -23302=%d", (int)axes.size()); + for (int i = 0; i < (int)axes.size(); i++) + { + int axis = axes[i]; + if (axis > 3 || axis < -3) fprintf(stderr, "Unsupported slice axes !\n"); + fprintf(pp, ",%d", axis); + } + } + if (!steps.empty()) + { + fprintf(pp, " -23303=%d", (int)steps.size()); + for (int i = 0; i < (int)steps.size(); i++) + { + int step = steps[i]; + if (step == 0) fprintf(stderr, "Unsupported slice step ! Unsupported slice step\n"); + fprintf(pp, ",%d", step); + } + } + } + } + else if (op == "Softmax") + { + int axis = get_node_attr_i(node, "axis", 1); + fprintf(pp, " 0=%d", axis - 1); + fprintf(pp, " 1=1"); + } + else if (op == "Split") + { + int axis = get_node_attr_i(node, "axis", 0); + std::vector split = get_node_attr_ai(node, "split"); + if (axis < 1) fprintf(stderr, "Unsupported split axis !\n"); + + fprintf(pp, " -23300=%d", output_size); + if (split.empty()) + { + for (int i = 0; i < output_size; i++) + { + fprintf(pp, ",-233"); + } + } + else + { + for (size_t i = 0; i < split.size() - 1; i++) + { + fprintf(pp, ",%d", split[i]); + } + fprintf(pp, ",-233"); + } + fprintf(pp, " 1=%d", axis - 1); + } + else if (op == "Sqrt") + { + int op_type = 5; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Squeeze") + { + std::vector axes = get_node_attr_ai(node, "axes"); + + if (axes.empty()) + { + fprintf(pp, " 0=1"); + fprintf(pp, " 1=1"); + fprintf(pp, " 2=1"); + } + else + { + bool flag = true; + for (int i = 0; i < (int)axes.size(); i++) + { + if (axes[i] == 0) + { + flag = false; + break; + } + } + if (flag == true) + { + fprintf(pp, " -23303=%zu", axes.size()); + for (int i = 0; i < (int)axes.size(); i++) + { + if (axes[i] == 0 || axes[i] > 3 || axes[i] < -3) + fprintf(stderr, "Unsupported squeeze axes !: %d, %s\n", axes[i], node.name().c_str()); + fprintf(pp, ",%d", axes[i] - 1); + } + } + } + } + else if (op == "Sub") + { + int op_type = 1; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "Sum") + { + int op_type = 1; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Swish") + { + // no param + } + else if (op == "Tan") + { + int op_type = 
+    else if (op == "Tanh")
+    {
+        int op_type = 16;
+        fprintf(pp, " 0=%d", op_type);
+    }
+    else if (op == "TopK")
+    {
+        int axis = get_node_attr_i(node, "axis", -1);
+        axis = axis > 0 ? axis - 1 : axis;
+        int largest = get_node_attr_i(node, "largest", 1);
+        int sorted = get_node_attr_i(node, "sorted", 1);
+        fprintf(pp, " 0=%d", axis);
+        fprintf(pp, " 1=%d", largest);
+        fprintf(pp, " 2=%d", sorted);
+    }
+    else if (op == "Transpose")
+    {
+        std::vector<int> perm = get_node_attr_ai(node, "perm");
+
+        if (perm.size() == 3)
+        {
+            if (perm[1] == 1 && perm[2] == 2)
+                fprintf(pp, " 0=0");    // w h
+            else if (perm[1] == 2 && perm[2] == 1)
+                fprintf(pp, " 0=1");    // h w
+            else if (perm[0] == 1 && perm[1] == 0 && perm[2] == 2)
+                fprintf(pp, " 0=0");    // w h
+            else if (perm[0] == 2 && perm[1] == 0 && perm[2] == 1)
+                fprintf(pp, " 0=1");    // h w
+        }
+        else if (perm.size() == 4)
+        {
+            if (perm[1] == 1 && perm[2] == 2 && perm[3] == 3)
+                fprintf(pp, " 0=0");    // w h c
+            else if (perm[1] == 1 && perm[2] == 3 && perm[3] == 2)
+                fprintf(pp, " 0=1");    // h w c
+            else if (perm[1] == 2 && perm[2] == 1 && perm[3] == 3)
+                fprintf(pp, " 0=2");    // w c h
+            else if (perm[1] == 2 && perm[2] == 3 && perm[3] == 1)
+                fprintf(pp, " 0=3");    // c w h
+            else if (perm[1] == 3 && perm[2] == 1 && perm[3] == 2)
+                fprintf(pp, " 0=4");    // h c w
+            else if (perm[1] == 3 && perm[2] == 2 && perm[3] == 1)
+                fprintf(pp, " 0=5");    // c h w
+        }
+        else if (perm.size() == 5)
+        {
+            if (perm[1] == 1 && perm[2] == 2 && perm[3] == 3 && perm[4] == 4)
+                fprintf(pp, " 0=0");    // wx h c
+            else if (perm[1] == 1 && perm[2] == 3 && perm[3] == 4 && perm[4] == 2)
+                fprintf(pp, " 0=1");    // h wx c
+            else if (perm[1] == 2 && perm[2] == 1 && perm[3] == 3 && perm[4] == 4)
+                fprintf(pp, " 0=2");    // wx c h
+            else if (perm[1] == 2 && perm[2] == 3 && perm[3] == 4 && perm[4] == 1)
+                fprintf(pp, " 0=3");    // c wx h
+            else if (perm[1] == 3 && perm[2] == 4 && perm[3] == 1 && perm[4] == 2)
+                fprintf(pp, " 0=4");    // h c wx
+            else if (perm[1] == 3 && perm[2] == 4 && perm[3] == 2 && perm[4] == 1)
+                fprintf(pp, " 0=5");    // c h wx
+            else
+                fprintf(stderr, "Unsupported transpose type !\n");
+        }
+    }
+    else if (op == "Upsample")
+    {
+        std::string mode = get_node_attr_s(node, "mode");
+        std::string align = get_node_attr_s(node, "coordinate_transformation_mode");
+
+        std::vector<float> scales;
+
+        if (node.input_size() == 1)
+        {
+            scales = get_node_attr_af(node, "scales");
+        }
+        else
+        {
+            scales = get_node_attr_from_input_af(weights[node.input(1)]);
+        }
+
+        int resize_type = 1;
+        if (mode == "nearest")
+        {
+            resize_type = 1;
+        }
+        else if (mode == "bilinear" || mode == "linear")
+        {
+            resize_type = 2;
+        }
+        else if (mode == "trilinear")
+        {
+            fprintf(stderr, "Unsupported Upsample mode !\n");
+        }
+
+        float h_scale = 1.f;
+        float w_scale = 1.f;
+        if (scales.size() == 2)
+        {
+            w_scale = scales[1];
+        }
+        else if (scales.size() == 3)
+        {
+            h_scale = scales[1];
+            w_scale = scales[2];
+        }
+        else if (scales.size() == 4)
+        {
+            h_scale = scales[2];
+            w_scale = scales[3];
+
+            if (scales[1] != 1.f) fprintf(stderr, "Unsupported Upsample scales !\n");
+        }
+        else
+        {
+            fprintf(stderr, "Unsupported Upsample scales !\n");
+        }
+
+        int align_corner = 0;
+        if (align == "align_corners")
+        {
+            align_corner = 1;
+        }
+
+        fprintf(pp, " 0=%d", resize_type);
+        fprintf(pp, " 1=%e", h_scale);
+        fprintf(pp, " 2=%e", w_scale);
+        fprintf(pp, " 6=%d", align_corner);
+    }
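// Annotation (editorial, not part of the patch): ncnn drops the batch dimension
// and stores blobs as (w, h, c), which is presumably why the handlers above and
// below shift ONNX axes by one ("axis - 1", "-1 for skip N-dim") and emit 4-D
// shapes in reversed order (shape[3], shape[2], shape[1]).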
< (int)axes.size(); i++) + { + if (axes[i] == 0) + { + flag = false; + break; + } + } + if (flag) + { + fprintf(pp, " -23303=%zu", axes.size()); + for (int i = 0; i < (int)axes.size(); i++) + { + if (axes[i] == 0 || axes[i] > 4 || axes[i] < -4) + fprintf(stderr, "Unsupported unsqueeze axes !: %d, %s\n", axes[i], node.name().c_str()); + fprintf(pp, ",%d", axes[i] - 1); + } + } + } + else if (op == "Yolov3DetectionOutput") + { + int num_class = get_node_attr_i(node, "num_class"); + int num_box = get_node_attr_i(node, "num_box"); + float confidence_threshold = get_node_attr_f(node, "confidence_threshold"); + float nms_threshold = get_node_attr_f(node, "nms_threshold"); + fprintf(pp, " 0=%d", num_class); + fprintf(pp, " 1=%d", num_box); + fprintf(pp, " 2=%e", confidence_threshold); + fprintf(pp, " 3=%e", nms_threshold); + std::vector biases = get_node_attr_af(node, "biases"); + if (biases.size() > 0) + { + fprintf(pp, " -23304=%zu", biases.size()); + for (int i = 0; i < (int)biases.size(); i++) + { + fprintf(pp, ",%e", biases[i]); + } + } + std::vector mask = get_node_attr_af(node, "mask"); + if (mask.size() > 0) + { + fprintf(pp, " -23305=%zu", mask.size()); + for (int i = 0; i < (int)mask.size(); i++) + { + fprintf(pp, ",%e", mask[i]); + } + } + std::vector anchors_scale = get_node_attr_af(node, "anchors_scale"); + if (anchors_scale.size() > 0) + { + fprintf(pp, " -23306=%zu", anchors_scale.size()); + for (int i = 0; i < (int)anchors_scale.size(); i++) + { + fprintf(pp, ",%e", anchors_scale[i]); + } + } + } + else + { + // TODO op specific param + } + + fprintf(pp, "\n"); + for (int j = 0; j < output_size; j++) + { + const std::string& output_name = node.output(j); + if (node_reference.find(output_name) != node_reference.end()) + { + int refcount = node_reference[output_name]; + if (refcount > 1) + { + char splitname[256]; + sprintf(splitname, "splitncnn_%d", internal_split); + fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); + + fprintf(pp, " %s", output_name.c_str()); + + for (int k = 0; k < refcount; k++) + { + fprintf(pp, " %s_splitncnn_%d", output_name.c_str(), k); + } + fprintf(pp, "\n"); + + internal_split++; + } } - } - } - fwrite_tensor_proto_data(qb, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose kw - { - const float* wptr = - kw.has_raw_data() ? (const float*)kw.raw_data().data() : kw.float_data().data(); - - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim + k]; - fwrite(&vb, sizeof(float), 1, bp); - } - } - } - fwrite_tensor_proto_data(kb, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose vw - { - const float* wptr = - vw.has_raw_data() ? (const float*)vw.raw_data().data() : vw.float_data().data(); - - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim + k]; - fwrite(&vb, sizeof(float), 1, bp); - } - } - } - fwrite_tensor_proto_data(vb, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose ow - { - const float* wptr = - ow.has_raw_data() ? 
(const float*)ow.raw_data().data() : ow.float_data().data(); - - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim + k]; - fwrite(&vb, sizeof(float), 1, bp); - } - } - } - fwrite_tensor_proto_data(ob, bp); - } - } else if (op == "Neg") { - int op_type = 1; - fprintf(pp, " 0=%d", op_type); - } else if (op == "NonMaxSuppression") { - int max_dets = 0; - float iou_thre = 0.f; - float score_thre = 0.f; - // fprintf(stderr, "%s\n", node.name().c_str()); - // fprintf(stderr, "node.input_size(): %d\n", node.input_size()); - if (node.input_size() >= 3) { - // fprintf(stderr, "ok12!\n"); - max_dets = (int)(get_node_attr_from_input(weights[node.input(2)]) + 0.5); - } - if (node.input_size() >= 4) { - // fprintf(stderr, "iou_thre: %f\n", - // get_node_attr_from_input(weights[node.input(3)])); - iou_thre = get_node_attr_from_input(weights[node.input(3)]); - } - if (node.input_size() >= 5) { - // fprintf(stderr, "score_thre: %f\n", - // get_node_attr_from_input(weights[node.input(4)])); - score_thre = get_node_attr_from_input(weights[node.input(4)]); - } - fprintf(pp, " 0=%d", max_dets); - fprintf(pp, " 1=%f", iou_thre); - fprintf(pp, " 2=%f", score_thre); - } else if (op == "Normalize") { - float eps = get_node_attr_f(node, "eps", 0.f); - int scale_data_size = 1; - - fprintf(pp, " 1=1"); // channel_shared - fprintf(pp, " 2=%e", eps); - fprintf(pp, " 3=%d", scale_data_size); - fprintf(pp, " 9=1"); // TODO hardcode pytorch style - - const float scale_data[1] = {1.f}; - fwrite(scale_data, sizeof(float), 1, bp); - } else if (op == "Pad") { - std::string mode = get_node_attr_s(node, "mode"); - float value = get_node_attr_f(node, "value", 0.f); - - std::vector pads; - if (node.input_size() == 1) { - pads = get_node_attr_ai(node, "pads"); - } else { - pads = get_node_attr_from_input_ai(weights[node.input(1)]); - } - int type = 0; - if (mode == "constant") { - type = 0; - } else if (mode == "edge") { - type = 1; - } else if (mode == "reflect") { - type = 2; - } - - int pad_size = (int)pads.size(); - int top = 0; - int bottom = 0; - int left = 0; - int right = 0; - int front = 0; - int behind = 0; - if (pad_size == 8) { - // NCHW - top = pads[2]; - bottom = pads[6]; - left = pads[3]; - right = pads[7]; - front = pads[1]; - behind = pads[5]; - } else if (pad_size == 6) { - // NHW - top = pads[1]; - bottom = pads[4]; - left = pads[2]; - right = pads[5]; - } else { - // NW - left = pads[1]; - right = pads[3]; - } - - fprintf(pp, " 0=%d", top); - fprintf(pp, " 1=%d", bottom); - fprintf(pp, " 2=%d", left); - fprintf(pp, " 3=%d", right); - fprintf(pp, " 4=%d", type); - fprintf(pp, " 5=%e", value); - fprintf(pp, " 7=%d", front); - fprintf(pp, " 8=%d", behind); - } else if (op == "Pow") { - int op_type = 6; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "PriorBox") { - std::vector min_sizes = get_node_attr_af(node, "min_sizes"); - std::vector max_sizes = get_node_attr_af(node, "max_sizes"); - std::vector aspect_ratios = get_node_attr_af(node, "aspect_ratios"); - fprintf(pp, " -23300=%zu", min_sizes.size()); - for (size_t j = 0; j < min_sizes.size(); ++j) { - fprintf(pp, ",%f", min_sizes[j]); - } - fprintf(pp, " -23301=%zu", max_sizes.size()); - for (size_t j = 0; j < max_sizes.size(); ++j) { - fprintf(pp, ",%f", max_sizes[j]); - } - fprintf(pp, " -23302=%zu", 
aspect_ratios.size()); - for (size_t j = 0; j < aspect_ratios.size(); ++j) { - fprintf(pp, ",%f", aspect_ratios[j]); - } - int image_width = get_node_attr_i(node, "image_width"); - int image_height = get_node_attr_i(node, "image_height"); - float step_width = get_node_attr_f(node, "step_width"); - float step_height = get_node_attr_f(node, "step_height"); - float offset = get_node_attr_f(node, "offset"); - int step_mmdetection = get_node_attr_i(node, "step_mmdetection"); - fprintf(pp, " 9=%d", image_width); - fprintf(pp, " 10=%d", image_height); - fprintf(pp, " 11=%f", step_width); - fprintf(pp, " 12=%f", step_height); - fprintf(pp, " 13=%f", offset); - fprintf(pp, " 14=%d", step_mmdetection); - } else if (op == "PixelShuffle") { - int scale_factor = get_node_attr_i(node, "scale_factor", 1); - fprintf(pp, " 0=%d", scale_factor); - } else if (op == "PRelu") { - const onnx::TensorProto& slope = weights[node.input(1)]; - - int num_slope = get_tensor_proto_data_size(slope); - - fprintf(pp, " 0=%d", num_slope); - - fwrite_tensor_proto_data(slope, bp); - } else if (op == "Reciprocal") { - int op_type = 15; - fprintf(pp, " 0=%d", op_type); - } else if (op == "ReduceMax" || op == "ReduceMin" || op == "ReduceMean" || op == "ReduceProd" || - op == "ReduceSum" || op == "ReduceSumSquare" || op == "ReduceL1" || - op == "ReduceL2" || op == "ReduceLogSum" || op == "ReduceLogSumExp") { - int op_type = -233; - if (op == "ReduceSum") - op_type = 0; - else if (op == "ReduceSumSquare") - op_type = 2; - else if (op == "ReduceMean") - op_type = 3; - else if (op == "ReduceMax") - op_type = 4; - else if (op == "ReduceMin") - op_type = 5; - else if (op == "ReduceProd") - op_type = 6; - else if (op == "ReduceL1") - op_type = 7; - else if (op == "ReduceL2") - op_type = 8; - else if (op == "ReduceLogSum") - op_type = 9; - else if (op == "ReduceLogSumExp") - op_type = 10; - fprintf(pp, " 0=%d", op_type); - - std::vector axes = get_node_attr_ai(node, "axes"); - int keepdims = get_node_attr_i(node, "keepdims", 1); - - if (axes.size() > 0) { - // if axes set, reduce according to axes - fprintf(pp, " 1=%d", 0); - fprintf(pp, " -23303=%zu", axes.size()); - for (size_t j = 0; j < axes.size(); j++) { - if (axes[j] == 0 || axes[j] > 4 || axes[j] < -3) - fprintf(stderr, "Unsupported reduction axes !\n"); - fprintf(pp, ",%d", axes[j] > 0 ? axes[j] - 1 : axes[j]); - } - } else { - // if axes not set, reduce all axes by default - fprintf(pp, " 1=%d", 1); - } - fprintf(pp, " 4=%d", keepdims); - fprintf(pp, " 5=1"); - } else if (op == "Reorg") { - int stride = get_node_attr_i(node, "stride", 1); - fprintf(pp, " 0=%d", stride); - } else if (op == "Reshape") { - std::vector shape; - - if (node.input_size() == 1) { - shape = get_node_attr_ai(node, "shape"); - } else if (weights.find(node.input(1)) != weights.end()) { - shape = get_node_attr_from_input_ai(weights[node.input(1)]); - } else { - fprintf(stderr, "Unsupported reshape weight ! 
\n"); - } - - if (shape.size() == 1) { - fprintf(pp, " 0=%d", shape[0]); // should never reach here - } else if (shape.size() == 2) { - fprintf(pp, " 0=%d", shape[1]); - } else if (shape.size() == 3) { - fprintf(pp, " 0=%d", shape[2]); - fprintf(pp, " 1=%d", shape[1]); - } else if (shape.size() == 4) { - fprintf(pp, " 0=%d", shape[3]); - fprintf(pp, " 1=%d", shape[2]); - fprintf(pp, " 2=%d", shape[1]); - } else if (shape.size() == 5) { - fprintf(pp, " 0=%d", shape[4] * shape[3]); - fprintf(pp, " 1=%d", shape[2]); - fprintf(pp, " 2=%d", shape[1]); - } - } else if (op == "Resize") { - std::string mode = get_node_attr_s(node, "mode"); - std::string align = get_node_attr_s(node, "coordinate_transformation_mode"); - - std::vector scales; - std::vector sizes; - if (node.input_size() == 2) { - // opset 10 - scales = get_node_attr_from_input_af(weights[node.input(1)]); - } else { - // opset 11+ - scales = get_node_attr_from_input_af(weights[node.input(2)]); - if (node.input_size() >= 4) { - sizes = get_node_attr_from_input_ai(weights[node.input(3)]); - } - } - - int resize_type = 1; - if (mode == "nearest") { - resize_type = 1; - } else if (mode == "linear") { - resize_type = 2; - } else if (mode == "cubic") { - resize_type = 3; - } - - if (scales.empty() && sizes.empty()) { - fprintf(stderr, "Unsupported Resize scales and sizes are all empty!\n"); - } - - float h_scale = 1.f; - float w_scale = 1.f; - if (scales.size() == 2) { - w_scale = scales[1]; - } else if (scales.size() == 3) { - h_scale = scales[1]; - w_scale = scales[2]; - } else if (scales.size() == 4) { - h_scale = scales[2]; - w_scale = scales[3]; - - if (scales[1] != 1.f) fprintf(stderr, "Unsupported Resize scales !\n"); - } - - int output_height = 0; - int output_width = 0; - if (sizes.size() == 2) { - output_width = sizes[1]; - } else if (sizes.size() == 3) { - output_height = sizes[1]; - output_width = sizes[2]; - } else if (sizes.size() == 4) { - output_height = sizes[2]; - output_width = sizes[3]; - } - - int align_corner = 0; - if (align == "align_corners") { - align_corner = 1; - } - - fprintf(pp, " 0=%d", resize_type); - fprintf(pp, " 1=%e", h_scale); - fprintf(pp, " 2=%e", w_scale); - fprintf(pp, " 3=%d", output_height); - fprintf(pp, " 4=%d", output_width); - fprintf(pp, " 6=%d", align_corner); - } else if (op == "RNN") { - const onnx::TensorProto& W = weights[node.input(1)]; - const onnx::TensorProto& R = weights[node.input(2)]; - const onnx::TensorProto& B = weights[node.input(3)]; - - int hidden_size = get_node_attr_i(node, "hidden_size", 0); - std::string direction = get_node_attr_s(node, "direction"); - - int direction_type = 0; - if (direction == "forward") { - direction_type = 0; - } else if (direction == "reverse") { - direction_type = 1; - } else if (direction == "bidirectional") { - direction_type = 2; - } - - int weight_data_size = get_tensor_proto_data_size(W); - - fprintf(pp, " 0=%d", hidden_size); - fprintf(pp, " 1=%d", weight_data_size); - fprintf(pp, " 2=%d", direction_type); - - int num_directions = direction_type == 2 ? 2 : 1; - - int quantize_tag = 0; - - fwrite(&quantize_tag, sizeof(int), 1, bp); - fwrite_tensor_proto_data(W, bp); - - // reduce xc and hc bias - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int bias_data_size_g = get_tensor_proto_data_size(B) / 2 / num_directions; - const float* bptr = - B.has_raw_data() ? 
(const float*)B.raw_data().data() : B.float_data().data(); - const float* xiptr = bptr; - const float* hiptr = bptr + bias_data_size_g; - - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xiptr[j] + hiptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - - if (direction_type == 2) { - xiptr += bias_data_size_g * 2; - hiptr += bias_data_size_g * 2; - - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xiptr[j] + hiptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - } - } - - fwrite(&quantize_tag, sizeof(int), 1, bp); - fwrite_tensor_proto_data(R, bp); - } else if (op == "RDiv") { - int op_type = 8; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "RSub") { - int op_type = 7; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "RoiAlign") { - int pooled_width = get_node_attr_i(node, "output_width", 1); - int pooled_height = get_node_attr_i(node, "output_height", 1); - float spatial_scale = get_node_attr_f(node, "spatial_scale", 1.f); - int sampling_ratio = get_node_attr_i(node, "sampling_ratio", 0); - fprintf(pp, " 0=%d", pooled_width); - fprintf(pp, " 1=%d", pooled_height); - fprintf(pp, " 2=%f", spatial_scale); - fprintf(pp, " 3=%d", sampling_ratio); - } else if (op == "ShuffleChannel") { - int group = get_node_attr_i(node, "group", 1); - int reverse = get_node_attr_i(node, "reverse", 0); - fprintf(pp, " 0=%d", group); - fprintf(pp, " 1=%d", reverse); - } else if (op == "Sigmoid") { - // no param - } else if (op == "Sin") { - int op_type = 9; - fprintf(pp, " 0=%d", op_type); - } else if (op == "SkipLayerNormalization") { - const onnx::TensorProto& W = weights[node.input(2)]; - const onnx::TensorProto& B = weights[node.input(3)]; - const onnx::TensorProto& B2 = weights[node.input(4)]; - - fprintf(pp, " 0=%d", get_tensor_proto_data_size(B)); - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(W, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(B, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(B2, bp); - } else if (op == "Slice") { - bool use_crop = true; - - std::vector starts; - std::vector ends; - std::vector axes; - std::vector steps; - if (node.input_size() == 1) { - starts = get_node_attr_ai(node, "starts"); - ends = get_node_attr_ai(node, "ends"); - axes = get_node_attr_ai(node, "axes"); - steps = get_node_attr_ai(node, "steps"); // TODO - } else { - starts = get_node_attr_from_input_ai(weights[node.input(1)]); - ends = get_node_attr_from_input_ai(weights[node.input(2)]); - if (node.input_size() >= 4) axes = get_node_attr_from_input_ai(weights[node.input(3)]); - if (node.input_size() >= 5) steps = get_node_attr_from_input_ai(weights[node.input(4)]); - } - - // assert step == 1 or step >= ends - for (int i = 0; i < (int)steps.size(); i++) { - if (steps[i] != 1 && steps[i] < ends[i]) { - use_crop = false; - fprintf(stderr, "Unsupported slice step ! 
Use custom TensorSlice\n"); - } - } - - if (use_crop) { - // filter out N-dim axis - if (!axes.empty()) { - for (int i = 0; i < (int)axes.size(); i++) { - int axis = axes[i]; - if (axis == 0) { - starts.erase(starts.begin() + i); - ends.erase(ends.begin() + i); - axes.erase(axes.begin() + i); - break; - } - } - } - - fprintf(pp, " -23309=%d", (int)starts.size()); - for (int i = 0; i < (int)starts.size(); i++) { - fprintf(pp, ",%d", starts[i]); - } - fprintf(pp, " -23310=%d", (int)ends.size()); - for (int i = 0; i < (int)ends.size(); i++) { - fprintf(pp, ",%d", ends[i]); - } - if (!axes.empty()) { - fprintf(pp, " -23311=%d", (int)axes.size()); - for (int i = 0; i < (int)axes.size(); i++) { - int axis = axes[i]; - if (axis == 0 || axis > 3 || axis < -3) fprintf(stderr, "Unsupported slice axes !\n"); - - if (axis > 0) axis = axis - 1; // -1 for skip N-dim - - fprintf(pp, ",%d", axis); - } - } - } else { - fprintf(pp, " -23300=%d", (int)starts.size()); - for (int i = 0; i < (int)starts.size(); i++) { - fprintf(pp, ",%d", starts[i]); - } - fprintf(pp, " -23301=%d", (int)ends.size()); - for (int i = 0; i < (int)ends.size(); i++) { - fprintf(pp, ",%d", ends[i]); - } - if (!axes.empty()) { - fprintf(pp, " -23302=%d", (int)axes.size()); - for (int i = 0; i < (int)axes.size(); i++) { - int axis = axes[i]; - if (axis > 3 || axis < -3) fprintf(stderr, "Unsupported slice axes !\n"); - fprintf(pp, ",%d", axis); - } - } - if (!steps.empty()) { - fprintf(pp, " -23303=%d", (int)steps.size()); - for (int i = 0; i < (int)steps.size(); i++) { - int step = steps[i]; - if (step == 0) fprintf(stderr, "Unsupported slice step ! Unsupported slice step\n"); - fprintf(pp, ",%d", step); - } - } - } - } else if (op == "Softmax") { - int axis = get_node_attr_i(node, "axis", 1); - fprintf(pp, " 0=%d", axis - 1); - fprintf(pp, " 1=1"); - } else if (op == "Split") { - int axis = get_node_attr_i(node, "axis", 0); - std::vector split = get_node_attr_ai(node, "split"); - if (axis < 1) fprintf(stderr, "Unsupported split axis !\n"); - - fprintf(pp, " -23300=%d", output_size); - if (split.empty()) { - for (int i = 0; i < output_size; i++) { - fprintf(pp, ",-233"); - } - } else { - for (size_t i = 0; i < split.size() - 1; i++) { - fprintf(pp, ",%d", split[i]); - } - fprintf(pp, ",-233"); - } - fprintf(pp, " 1=%d", axis - 1); - } else if (op == "Sqrt") { - int op_type = 5; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Squeeze") { - std::vector axes = get_node_attr_ai(node, "axes"); - - if (axes.empty()) { - fprintf(pp, " 0=1"); - fprintf(pp, " 1=1"); - fprintf(pp, " 2=1"); - } else { - bool flag = true; - for (int i = 0; i < (int)axes.size(); i++) { - if (axes[i] == 0) { - flag = false; - break; - } - } - if (flag == true) { - fprintf(pp, " -23303=%zu", axes.size()); - for (int i = 0; i < (int)axes.size(); i++) { - if (axes[i] == 0 || axes[i] > 3 || axes[i] < -3) - fprintf(stderr, "Unsupported squeeze axes !: %d, %s\n", axes[i], node.name().c_str()); - fprintf(pp, ",%d", axes[i] - 1); - } - } - } - } else if (op == "Sub") { - int op_type = 1; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "Sum") { - int op_type = 1; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Swish") { - // no param - } else if (op == "Tan") { - int op_type = 11; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Tanh") { - int op_type = 16; - 
fprintf(pp, " 0=%d", op_type); - } else if (op == "TopK") { - int axis = get_node_attr_i(node, "axis", -1); - axis = axis > 0 ? axis - 1 : axis; - int largest = get_node_attr_i(node, "largest", 1); - int sorted = get_node_attr_i(node, "sorted", 1); - fprintf(pp, " 0=%d", axis); - fprintf(pp, " 1=%d", largest); - fprintf(pp, " 2=%d", sorted); - } else if (op == "Transpose") { - std::vector perm = get_node_attr_ai(node, "perm"); - - if (perm.size() == 3) { - if (perm[1] == 1 && perm[2] == 2) - fprintf(pp, " 0=0"); // w h - else if (perm[1] == 2 && perm[2] == 1) - fprintf(pp, " 0=1"); // h w - else if (perm[0] == 1 && perm[1] == 0 && perm[2] == 2) - fprintf(pp, " 0=0"); // w h - else if (perm[0] == 2 && perm[1] == 0 && perm[2] == 1) - fprintf(pp, " 0=1"); // h w - } else if (perm.size() == 4) { - if (perm[1] == 1 && perm[2] == 2 && perm[3] == 3) - fprintf(pp, " 0=0"); // w h c - else if (perm[1] == 1 && perm[2] == 3 && perm[3] == 2) - fprintf(pp, " 0=1"); // h w c - else if (perm[1] == 2 && perm[2] == 1 && perm[3] == 3) - fprintf(pp, " 0=2"); // w c h - else if (perm[1] == 2 && perm[2] == 3 && perm[3] == 1) - fprintf(pp, " 0=3"); // c w h - else if (perm[1] == 3 && perm[2] == 1 && perm[3] == 2) - fprintf(pp, " 0=4"); // h c w - else if (perm[1] == 3 && perm[2] == 2 && perm[3] == 1) - fprintf(pp, " 0=5"); // c h w - } else if (perm.size() == 5) { - if (perm[1] == 1 && perm[2] == 2 && perm[3] == 3 && perm[4] == 4) - fprintf(pp, " 0=0"); // wx h c - else if (perm[1] == 1 && perm[2] == 3 && perm[3] == 4 && perm[4] == 2) - fprintf(pp, " 0=1"); // h wx c - else if (perm[1] == 2 && perm[2] == 1 && perm[3] == 3 && perm[4] == 4) - fprintf(pp, " 0=2"); // wx c h - else if (perm[1] == 2 && perm[2] == 3 && perm[3] == 4 && perm[4] == 1) - fprintf(pp, " 0=3"); // c wx h - else if (perm[1] == 3 && perm[2] == 4 && perm[3] == 1 && perm[4] == 2) - fprintf(pp, " 0=4"); // h c wx - else if (perm[1] == 3 && perm[2] == 4 && perm[3] == 2 && perm[4] == 1) - fprintf(pp, " 0=5"); // c h wx - else - fprintf(stderr, "Unsupported transpose type !\n"); - } - } else if (op == "Upsample") { - std::string mode = get_node_attr_s(node, "mode"); - std::string align = get_node_attr_s(node, "coordinate_transformation_mode"); - - std::vector scales; - - if (node.input_size() == 1) { - scales = get_node_attr_af(node, "scales"); - } else { - scales = get_node_attr_from_input_af(weights[node.input(1)]); - } - - int resize_type = 1; - if (mode == "nearest") { - resize_type = 1; - } else if (mode == "bilinear" || mode == "linear") { - resize_type = 2; - } else if (mode == "trilinear") { - fprintf(stderr, "Unsupported Upsample mode !\n"); - } - - float h_scale = 1.f; - float w_scale = 1.f; - if (scales.size() == 2) { - w_scale = scales[1]; - } else if (scales.size() == 3) { - h_scale = scales[1]; - w_scale = scales[2]; - } else if (scales.size() == 4) { - h_scale = scales[2]; - w_scale = scales[3]; - - if (scales[1] != 1.f) fprintf(stderr, "Unsupported Upsample scales !\n"); - } else { - fprintf(stderr, "Unsupported Upsample scales !\n"); - } - - int align_corner = 0; - if (align == "align_corners") { - align_corner = 1; - } - - fprintf(pp, " 0=%d", resize_type); - fprintf(pp, " 1=%e", h_scale); - fprintf(pp, " 2=%e", w_scale); - fprintf(pp, " 6=%d", align_corner); - } else if (op == "Unsqueeze") { - std::vector axes = get_node_attr_ai(node, "axes"); - bool flag = true; - for (int i = 0; i < (int)axes.size(); i++) { - if (axes[i] == 0) { - flag = false; - break; - } - } - if (flag) { - fprintf(pp, " -23303=%zu", axes.size()); - for (int 
i = 0; i < (int)axes.size(); i++) { - if (axes[i] == 0 || axes[i] > 4 || axes[i] < -4) - fprintf(stderr, "Unsupported unsqueeze axes !: %d, %s\n", axes[i], node.name().c_str()); - fprintf(pp, ",%d", axes[i] - 1); - } - } - } else if (op == "Yolov3DetectionOutput") { - int num_class = get_node_attr_i(node, "num_class"); - int num_box = get_node_attr_i(node, "num_box"); - float confidence_threshold = get_node_attr_f(node, "confidence_threshold"); - float nms_threshold = get_node_attr_f(node, "nms_threshold"); - fprintf(pp, " 0=%d", num_class); - fprintf(pp, " 1=%d", num_box); - fprintf(pp, " 2=%e", confidence_threshold); - fprintf(pp, " 3=%e", nms_threshold); - std::vector biases = get_node_attr_af(node, "biases"); - if (biases.size() > 0) { - fprintf(pp, " -23304=%zu", biases.size()); - for (int i = 0; i < (int)biases.size(); i++) { - fprintf(pp, ",%e", biases[i]); - } - } - std::vector mask = get_node_attr_af(node, "mask"); - if (mask.size() > 0) { - fprintf(pp, " -23305=%zu", mask.size()); - for (int i = 0; i < (int)mask.size(); i++) { - fprintf(pp, ",%e", mask[i]); - } - } - std::vector anchors_scale = get_node_attr_af(node, "anchors_scale"); - if (anchors_scale.size() > 0) { - fprintf(pp, " -23306=%zu", anchors_scale.size()); - for (int i = 0; i < (int)anchors_scale.size(); i++) { - fprintf(pp, ",%e", anchors_scale[i]); - } - } - } else { - // TODO op specific param - } - - fprintf(pp, "\n"); - for (int j = 0; j < output_size; j++) { - const std::string& output_name = node.output(j); - if (node_reference.find(output_name) != node_reference.end()) { - int refcount = node_reference[output_name]; - if (refcount > 1) { - char splitname[256]; - sprintf(splitname, "splitncnn_%d", internal_split); - fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); - - fprintf(pp, " %s", output_name.c_str()); - - for (int k = 0; k < refcount; k++) { - fprintf(pp, " %s_splitncnn_%d", output_name.c_str(), k); - } - fprintf(pp, "\n"); - - internal_split++; } - } } - } - fclose(pp); - fclose(bp); - fprintf(stderr, "onnx2ncnn finish\n"); - return 0; + fclose(pp); + fclose(bp); + fprintf(stderr, "onnx2ncnn finish\n"); + return 0; } diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.cpp b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.cpp index dd1fe2c4f6..42482ee8b8 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.cpp @@ -14,157 +14,179 @@ * @return std::tuple> */ std::tuple> query_shape( - onnx::GraphProto* mutable_graph, onnx::NodeProto* target, + onnx::GraphProto* mutable_graph, + onnx::NodeProto* target, const std::map& weights, - std::map>& context) { - // emplace all input nodes - const int input_count = mutable_graph->input_size(); - for (int i = 0; i < input_count; i++) { - auto inp = mutable_graph->input(i); - onnx::TypeProto inp_type = inp.type(); - onnx::TensorShapeProto shape_proto = inp_type.tensor_type().shape(); - - auto dim_size = shape_proto.dim_size(); - std::vector shape(dim_size); - for (int index = 0; index < dim_size; ++index) { - shape[index] = shape_proto.dim(index).dim_value(); - } - - context.emplace(inp.name(), shape); - } - - // BFS the tree, `target` as root, onnx::graph inputs and weights as leaf nodes - std::vector serial = {target}; - { - std::set mark_as_appended = {}; - while (true) { - int start = 0, end = serial.size(); - for (int i = start; i < end; ++i) { - auto node_ptr = serial[i]; - auto len = node_ptr->input_size(); - - for (int j = 0; j < 
len; ++j) {
-          std::string name = node_ptr->input(j);
-          if (context.find(name) != context.end()) {
-            // if input founded, skip
-            continue;
-          }
-
-          if (weights.find(name) != weights.end()) {
-            // if founded in weights, extract shape to context
-            auto weight = weights.at(name);
-            std::vector<int> shape;
-            for (auto index = 0; index < weight.dims_size(); ++index) {
-              shape.emplace_back(weight.dims(index));
-            }
-            context.emplace(name, shape);
-            continue;
-          }
-
-          if (mark_as_appended.find(name) != mark_as_appended.end()) {
-            // if mark as appended, skip
-            continue;
-          }
-          // else append it to serialization list
-          auto depend_ptr = find_node_by_output_name(mutable_graph, name);
-          if (depend_ptr == nullptr) {
-            fprintf(stderr, "cannot find %s from graph !\n", name.c_str());
-            return std::make_tuple(false, std::vector<int>{});
-          }
-          mark_as_appended.insert(name);
-          serial.emplace_back(depend_ptr);
+    std::map<std::string, std::vector<int>>& context)
+{
+    // emplace all input nodes
+    const int input_count = mutable_graph->input_size();
+    for (int i = 0; i < input_count; i++)
+    {
+        auto inp = mutable_graph->input(i);
+        onnx::TypeProto inp_type = inp.type();
+        onnx::TensorShapeProto shape_proto = inp_type.tensor_type().shape();
+
+        auto dim_size = shape_proto.dim_size();
+        std::vector<int> shape(dim_size);
+        for (int index = 0; index < dim_size; ++index)
+        {
+            shape[index] = shape_proto.dim(index).dim_value();
        }
-      }
-      if (serial.size() <= end) {
-        // if not new node added, quit
-        break;
-      }
-
-      // update start and end position, continue BFS the tree
-      start = end;
-      end = serial.size();
+        context.emplace(inp.name(), shape);
    }
-    }
-
-    // for each node in serialization list, calculate the output shape
-    {
-      std::reverse(serial.begin(), serial.end());
-      for (auto node : serial) {
-        if (node->op_type() == "Conv") {
-          auto inp = context[node->input(0)];
-          auto weight = context[node->input(1)];
-          assert(inp.size() == 4 and weight.size() == 4);
-
-          int group = get_node_attr_i(*node, "group", 1);
-          assert(group == 1);
-
-          // treat multiple spatial attr as single one
-#define EXTRACT_REPEATED_PARAM(NAME, ATTR, DEFAULT) \
-  int ATTR = DEFAULT; \
-  { \
-    std::vector<int> _vec = get_node_attr_ai(*node, NAME); \
-    if (not _vec.empty()) { \
-      ATTR = _vec[0]; \
-    } \
-  }
-
-          EXTRACT_REPEATED_PARAM("dilations", dilation, 1);
-          EXTRACT_REPEATED_PARAM("pads", pad, 0);
-          EXTRACT_REPEATED_PARAM("strides", stride, 1);
-
-#undef EXTRACT_REPEATED_PARAM
-          int on = inp[0];
-          int oc = weight[0];
-          int oh = (inp[2] + 2 * pad - weight[2]) / stride + 1;
-          int ow = (inp[3] + 2 * pad - weight[3]) / stride + 1;
-          context.emplace(node->output(0), std::vector<int>{on, oc, oh, ow});
-
-        } else if (node->op_type() == "Shape") {
-          auto inp = context[node->input(0)];
-          context.emplace(node->output(0), std::vector<int>{1, inp[1], inp[2], inp[3]});
-
-        } else if (node->op_type() == "Slice") {
-          assert(node->input_size() >= 4);
+    // BFS the tree, `target` as root, onnx::graph inputs and weights as leaf nodes
+    std::vector<onnx::NodeProto*> serial = {target};
+    {
+        std::set<std::string> mark_as_appended = {};
+        while (true)
+        {
+            int start = 0, end = serial.size();
+            for (int i = start; i < end; ++i)
+            {
+                auto node_ptr = serial[i];
+                auto len = node_ptr->input_size();
+
+                for (int j = 0; j < len; ++j)
+                {
+                    std::string name = node_ptr->input(j);
+                    if (context.find(name) != context.end())
+                    {
+                        // if input founded, skip
+                        continue;
+                    }
+
+                    if (weights.find(name) != weights.end())
+                    {
+                        // if founded in weights, extract shape to context
+                        auto weight = weights.at(name);
+                        std::vector<int> shape;
+                        for (auto index = 0; index <
weight.dims_size(); ++index) + { + shape.emplace_back(weight.dims(index)); + } + context.emplace(name, shape); + continue; + } + + if (mark_as_appended.find(name) != mark_as_appended.end()) + { + // if mark as appended, skip + continue; + } + // else append it to serialization list + auto depend_ptr = find_node_by_output_name(mutable_graph, name); + if (depend_ptr == nullptr) + { + fprintf(stderr, "cannot find %s from graph !\n", name.c_str()); + return std::make_tuple(false, std::vector{}); + } + mark_as_appended.insert(name); + serial.emplace_back(depend_ptr); + } + } - auto inp = context[node->input(0)]; - int start = get_node_attr_from_input(weights.at(node->input(1))); - int end = get_node_attr_from_input(weights.at(node->input(2))); - int axes = get_node_attr_from_input(weights.at(node->input(3))); + if (serial.size() <= end) + { + // if not new node added, quit + break; + } - if (axes != 0) { - fprintf(stderr, "Not support axes=%d !\n", axes); - return std::make_tuple(false, std::vector{}); + // update start and end position, continue BFS the tree + start = end; + end = serial.size(); } + } - assert(inp.size() >= end - start); - context.emplace(node->output(0), std::vector{inp.begin() + start, inp.begin() + end}); - - } else if (node->op_type() == "Concat") { - assert(node->input_size() >= 2); - - auto axis = get_node_attr_i(*node, "axis", 0); - if (axis != 0) { - fprintf(stderr, "Not support axes=%d !\n", axis); - return std::make_tuple(false, std::vector{}); - } + // for each node in serialization list, calculate the output shape + { + std::reverse(serial.begin(), serial.end()); + for (auto node : serial) + { + if (node->op_type() == "Conv") + { + auto inp = context[node->input(0)]; + auto weight = context[node->input(1)]; + assert(inp.size() == 4 and weight.size() == 4); + + int group = get_node_attr_i(*node, "group", 1); + assert(group == 1); + + // treat multiple spatial attr as single one +#define EXTRACT_REPEATED_PARAM(NAME, ATTR, DEFAULT) \ + int ATTR = DEFAULT; \ + { \ + std::vector _vec = get_node_attr_ai(*node, NAME); \ + if (not _vec.empty()) \ + { \ + ATTR = _vec[0]; \ + } \ + } - std::vector inp = context[node->input(0)]; - std::vector w_data = get_node_attr_from_input_ai(weights.at(node->input(1))); + EXTRACT_REPEATED_PARAM("dilations", dilation, 1); + EXTRACT_REPEATED_PARAM("pads", pad, 0); + EXTRACT_REPEATED_PARAM("strides", stride, 1); - // concat data on axis 0 - inp.insert(inp.end(), w_data.begin(), w_data.end()); - context.emplace(node->output(0), inp); +#undef EXTRACT_REPEATED_PARAM - } else { - fprintf(stderr, "Unsupported type %s in query_shape !\n", node->op_type().c_str()); - return std::make_tuple(false, std::vector{}); - } + int on = inp[0]; + int oc = weight[0]; + int oh = (inp[2] + 2 * pad - weight[2]) / stride + 1; + int ow = (inp[3] + 2 * pad - weight[3]) / stride + 1; + context.emplace(node->output(0), std::vector{on, oc, oh, ow}); + } + else if (node->op_type() == "Shape") + { + auto inp = context[node->input(0)]; + context.emplace(node->output(0), std::vector{1, inp[1], inp[2], inp[3]}); + } + else if (node->op_type() == "Slice") + { + assert(node->input_size() >= 4); + + auto inp = context[node->input(0)]; + int start = get_node_attr_from_input(weights.at(node->input(1))); + int end = get_node_attr_from_input(weights.at(node->input(2))); + int axes = get_node_attr_from_input(weights.at(node->input(3))); + + if (axes != 0) + { + fprintf(stderr, "Not support axes=%d !\n", axes); + return std::make_tuple(false, std::vector{}); + } + + 
assert(inp.size() >= end - start); + context.emplace(node->output(0), std::vector{inp.begin() + start, inp.begin() + end}); + } + else if (node->op_type() == "Concat") + { + assert(node->input_size() >= 2); + + auto axis = get_node_attr_i(*node, "axis", 0); + if (axis != 0) + { + fprintf(stderr, "Not support axes=%d !\n", axis); + return std::make_tuple(false, std::vector{}); + } + + std::vector inp = context[node->input(0)]; + std::vector w_data = get_node_attr_from_input_ai(weights.at(node->input(1))); + + // concat data on axis 0 + inp.insert(inp.end(), w_data.begin(), w_data.end()); + context.emplace(node->output(0), inp); + } + else + { + fprintf(stderr, "Unsupported type %s in query_shape !\n", node->op_type().c_str()); + return std::make_tuple(false, std::vector{}); + } + } } - } - assert(context.find(target->output(0)) != context.end()); - auto target_shape = context[target->output(0)]; - return std::make_tuple(true, target_shape); + assert(context.find(target->output(0)) != context.end()); + auto target_shape = context[target->output(0)]; + return std::make_tuple(true, target_shape); } diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.h b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.h index fa62ffe9de..e7a29a2cef 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.h +++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.h @@ -14,6 +14,7 @@ * @return std::tuple> */ std::tuple> query_shape( - onnx::GraphProto* mutable_graph, onnx::NodeProto* target, + onnx::GraphProto* mutable_graph, + onnx::NodeProto* target, const std::map& weights, - std::map>& context); + std::map>& context); diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/utils.h b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/utils.h index 792db0ed34..ab991a52f9 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/utils.h +++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/utils.h @@ -21,381 +21,496 @@ * @param name * @return onnx::NodeProto* */ -static onnx::NodeProto* find_node_by_output_name(onnx::GraphProto* mutable_graph, - const std::string& name) { - const int input_count = mutable_graph->node_size(); - for (int i = 0; i < input_count; ++i) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - for (int j = 0; j < node->output_size(); ++j) { - auto output = node->output(j); - if (output == name) { - return node; - } +static onnx::NodeProto* find_node_by_output_name(onnx::GraphProto* mutable_graph, + const std::string& name) +{ + const int input_count = mutable_graph->node_size(); + for (int i = 0; i < input_count; ++i) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + for (int j = 0; j < node->output_size(); ++j) + { + auto output = node->output(j); + if (output == name) + { + return node; + } + } } - } - return nullptr; + return nullptr; } -static bool read_proto_from_binary(const char* filepath, onnx::ModelProto* message) { - std::ifstream fs(filepath, std::ifstream::in | std::ifstream::binary); - if (!fs.is_open()) { - fprintf(stderr, "open failed %s\n", filepath); - return false; - } +static bool read_proto_from_binary(const char* filepath, onnx::ModelProto* message) +{ + std::ifstream fs(filepath, std::ifstream::in | std::ifstream::binary); + if (!fs.is_open()) + { + fprintf(stderr, "open failed %s\n", filepath); + return false; + } - google::protobuf::io::IstreamInputStream input(&fs); - google::protobuf::io::CodedInputStream codedstr(&input); + google::protobuf::io::IstreamInputStream input(&fs); + 
google::protobuf::io::CodedInputStream codedstr(&input); #if GOOGLE_PROTOBUF_VERSION >= 3011000 - codedstr.SetTotalBytesLimit(INT_MAX); + codedstr.SetTotalBytesLimit(INT_MAX); #else - codedstr.SetTotalBytesLimit(INT_MAX, INT_MAX / 2); + codedstr.SetTotalBytesLimit(INT_MAX, INT_MAX / 2); #endif - bool success = message->ParseFromCodedStream(&codedstr); + bool success = message->ParseFromCodedStream(&codedstr); - fs.close(); + fs.close(); - return success; + return success; } -static std::vector get_node_attr_ai(const onnx::NodeProto& node, const char* key) { - std::vector v; +static std::vector get_node_attr_ai(const onnx::NodeProto& node, const char* key) +{ + std::vector v; + + for (int i = 0; i < node.attribute_size(); i++) + { + const onnx::AttributeProto& attr = node.attribute(i); + if (attr.name() == key) + { + v.resize(attr.ints_size()); + for (int j = 0; j < attr.ints_size(); j++) + { + v[j] = std::max(std::min(attr.ints(j), (::google::protobuf::int64)INT_MAX), + (::google::protobuf::int64)INT_MIN); + } + + break; + } + } - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attr = node.attribute(i); - if (attr.name() == key) { - v.resize(attr.ints_size()); - for (int j = 0; j < attr.ints_size(); j++) { - v[j] = std::max(std::min(attr.ints(j), (::google::protobuf::int64)INT_MAX), - (::google::protobuf::int64)INT_MIN); - } + return v; +} - break; +static void set_node_attr_ai(onnx::NodeProto& node, const char* key, const std::vector& value) +{ + onnx::AttributeProto* attr_group = node.add_attribute(); + attr_group->set_name(key); + for (auto v : value) + { + attr_group->add_ints(v); } - } - return v; + return; } -static void set_node_attr_ai(onnx::NodeProto& node, const char* key, - const std::vector& value) { - onnx::AttributeProto* attr_group = node.add_attribute(); - attr_group->set_name(key); - for (auto v : value) { - attr_group->add_ints(v); - } +static std::vector get_node_attr_af(const onnx::NodeProto& node, const char* key) +{ + std::vector v; + + for (int i = 0; i < node.attribute_size(); i++) + { + const onnx::AttributeProto& attr = node.attribute(i); + if (attr.name() == key) + { + v.resize(attr.floats_size()); + for (int j = 0; j < attr.floats_size(); j++) + { + v[j] = attr.floats(j); + } + + break; + } + } - return; + return v; } -static std::vector get_node_attr_af(const onnx::NodeProto& node, const char* key) { - std::vector v; +static int get_node_attr_i(const onnx::NodeProto& node, const char* key, int def = 0) +{ + for (int i = 0; i < node.attribute_size(); i++) + { + const onnx::AttributeProto& attr = node.attribute(i); + if (attr.name() == key) + { + return std::max(std::min(attr.i(), (::google::protobuf::int64)INT_MAX), + (::google::protobuf::int64)INT_MIN); + } + } - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attr = node.attribute(i); - if (attr.name() == key) { - v.resize(attr.floats_size()); - for (int j = 0; j < attr.floats_size(); j++) { - v[j] = attr.floats(j); - } + return def; +} - break; +static float get_node_attr_f(const onnx::NodeProto& node, const char* key, float def = 0.f) +{ + for (int i = 0; i < node.attribute_size(); i++) + { + const onnx::AttributeProto& attr = node.attribute(i); + if (attr.name() == key) + { + return attr.f(); + } } - } - return v; + return def; } -static int get_node_attr_i(const onnx::NodeProto& node, const char* key, int def = 0) { - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attr = node.attribute(i); - if (attr.name() == 
key) { - return std::max(std::min(attr.i(), (::google::protobuf::int64)INT_MAX), - (::google::protobuf::int64)INT_MIN); +static std::string get_node_attr_s(const onnx::NodeProto& node, const char* key, const std::string& def = std::string()) +{ + for (int i = 0; i < node.attribute_size(); i++) + { + const onnx::AttributeProto& attr = node.attribute(i); + if (attr.name() == key) + { + return attr.s(); + } } - } - return def; + return def; } -static float get_node_attr_f(const onnx::NodeProto& node, const char* key, float def = 0.f) { - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attr = node.attribute(i); - if (attr.name() == key) { - return attr.f(); +static onnx::TensorProto get_node_attr_tensor(const onnx::NodeProto& node, const char* key) +{ + for (int i = 0; i < node.attribute_size(); i++) + { + const onnx::AttributeProto& attr = node.attribute(i); + if (attr.name() == key) + { + return attr.t(); + } } - } - return def; + return onnx::TensorProto(); } -static std::string get_node_attr_s(const onnx::NodeProto& node, const char* key, - const std::string& def = std::string()) { - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attr = node.attribute(i); - if (attr.name() == key) { - return attr.s(); +template +static T get_node_attr_from_input(const onnx::TensorProto& tp) +{ + T v = 0.f; + + // float + if (tp.data_type() == 1) + { + const float* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const float*)tp.raw_data().data(); + } + else + { + shape_data = tp.float_data().data(); + } + v = shape_data[0]; + } + // double + else if (tp.data_type() == 11) + { + const double* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const double*)tp.raw_data().data(); + } + else + { + shape_data = tp.double_data().data(); + } + v = shape_data[0]; + } + // int64 + else if (tp.data_type() == 7) + { + const int64_t* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const int64_t*)tp.raw_data().data(); + } + else + { + shape_data = tp.int64_data().data(); + } + v = std::max(std::min(shape_data[0], (::google::protobuf::int64)INT_MAX), + (::google::protobuf::int64)INT_MIN); + } + // int32 + else if (tp.data_type() == 6) + { + const int32_t* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const int32_t*)tp.raw_data().data(); + } + else + { + shape_data = tp.int32_data().data(); + } + v = shape_data[0]; + } + else + { + // fprintf(stderr, "tp.name: %s\n", tp.name().c_str()); + fprintf(stderr, "Unknown data type %d\n", tp.data_type()); + fprintf(stderr, "get_node_attr_from_input\n"); + abort(); } - } - return def; + return v; } -static onnx::TensorProto get_node_attr_tensor(const onnx::NodeProto& node, const char* key) { - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attr = node.attribute(i); - if (attr.name() == key) { - return attr.t(); +static std::vector get_node_attr_from_input_ai(const onnx::TensorProto& tp) +{ + int size = 0; + + std::vector v; + + // int64 + if (tp.data_type() == 7) + { + const int64_t* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const int64_t*)tp.raw_data().data(); + size = (int)(tp.raw_data().size() / 8); + } + else + { + shape_data = tp.int64_data().data(); + size = tp.int64_data_size(); + } + for (int j = 0; j < size; j++) + { + int vi = std::max(std::min(shape_data[j], (::google::protobuf::int64)INT_MAX), + (::google::protobuf::int64)INT_MIN); + v.push_back(vi); + } + } + // int32 + else if (tp.data_type() == 6) + { 
+ const int32_t* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const int32_t*)tp.raw_data().data(); + size = (int)(tp.raw_data().size() / 4); + } + else + { + shape_data = tp.int32_data().data(); + size = tp.int32_data_size(); + } + for (int j = 0; j < size; j++) + { + v.push_back(shape_data[j]); + } + } + else + { + fprintf(stderr, "Unknown data type %d\n", tp.data_type()); } - } - return onnx::TensorProto(); + return v; } -template -static T get_node_attr_from_input(const onnx::TensorProto& tp) { - T v = 0.f; - - // float - if (tp.data_type() == 1) { - const float* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const float*)tp.raw_data().data(); - } else { - shape_data = tp.float_data().data(); - } - v = shape_data[0]; - } - // double - else if (tp.data_type() == 11) { - const double* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const double*)tp.raw_data().data(); - } else { - shape_data = tp.double_data().data(); - } - v = shape_data[0]; - } - // int64 - else if (tp.data_type() == 7) { - const int64_t* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const int64_t*)tp.raw_data().data(); - } else { - shape_data = tp.int64_data().data(); - } - v = std::max(std::min(shape_data[0], (::google::protobuf::int64)INT_MAX), - (::google::protobuf::int64)INT_MIN); - } - // int32 - else if (tp.data_type() == 6) { - const int32_t* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const int32_t*)tp.raw_data().data(); - } else { - shape_data = tp.int32_data().data(); - } - v = shape_data[0]; - } else { - // fprintf(stderr, "tp.name: %s\n", tp.name().c_str()); - fprintf(stderr, "Unknown data type %d\n", tp.data_type()); - fprintf(stderr, "get_node_attr_from_input\n"); - abort(); - } - - return v; -} +static std::vector get_node_attr_from_input_af(const onnx::TensorProto& tp) +{ + int size = 0; + + std::vector v; + + // float + if (tp.data_type() == 1) + { + const float* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const float*)tp.raw_data().data(); + size = (int)(tp.raw_data().size() / 4); + } + else + { + shape_data = tp.float_data().data(); + size = tp.float_data_size(); + } + for (int j = 0; j < size; j++) + { + v.push_back(shape_data[j]); + } + } + // double + else if (tp.data_type() == 11) + { + const double* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const double*)tp.raw_data().data(); + size = (int)(tp.raw_data().size() / 8); + } + else + { + shape_data = tp.double_data().data(); + size = tp.double_data_size(); + } + for (int j = 0; j < size; j++) + { + v.push_back((float)shape_data[j]); + } + } + else + { + fprintf(stderr, "Unknown data type %d\n", tp.data_type()); + } -static std::vector get_node_attr_from_input_ai(const onnx::TensorProto& tp) { - int size = 0; - - std::vector v; - - // int64 - if (tp.data_type() == 7) { - const int64_t* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const int64_t*)tp.raw_data().data(); - size = (int)(tp.raw_data().size() / 8); - } else { - shape_data = tp.int64_data().data(); - size = tp.int64_data_size(); - } - for (int j = 0; j < size; j++) { - int vi = std::max(std::min(shape_data[j], (::google::protobuf::int64)INT_MAX), - (::google::protobuf::int64)INT_MIN); - v.push_back(vi); - } - } - // int32 - else if (tp.data_type() == 6) { - const int32_t* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const int32_t*)tp.raw_data().data(); - size = (int)(tp.raw_data().size() / 4); - } else { - shape_data = tp.int32_data().data(); - size = 
tp.int32_data_size(); - } - for (int j = 0; j < size; j++) { - v.push_back(shape_data[j]); - } - } else { - fprintf(stderr, "Unknown data type %d\n", tp.data_type()); - } - - return v; + return v; } -static std::vector get_node_attr_from_input_af(const onnx::TensorProto& tp) { - int size = 0; - - std::vector v; - - // float - if (tp.data_type() == 1) { - const float* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const float*)tp.raw_data().data(); - size = (int)(tp.raw_data().size() / 4); - } else { - shape_data = tp.float_data().data(); - size = tp.float_data_size(); - } - for (int j = 0; j < size; j++) { - v.push_back(shape_data[j]); - } - } - // double - else if (tp.data_type() == 11) { - const double* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const double*)tp.raw_data().data(); - size = (int)(tp.raw_data().size() / 8); - } else { - shape_data = tp.double_data().data(); - size = tp.double_data_size(); - } - for (int j = 0; j < size; j++) { - v.push_back((float)shape_data[j]); - } - } else { - fprintf(stderr, "Unknown data type %d\n", tp.data_type()); - } - - return v; -} +static int get_tensor_proto_data_size(const onnx::TensorProto& tp) +{ + if (tp.has_raw_data()) + { + if (tp.data_type() == 1 || tp.data_type() == 6) + { + const std::string& raw_data = tp.raw_data(); + int size = (int)raw_data.size() / 4; + return size; + } + else if (tp.data_type() == 7 || tp.data_type() == 11) + { + const std::string& raw_data = tp.raw_data(); + int size = (int)raw_data.size() / 8; + return size; + } + else if (tp.data_type() == 9) + { + const std::string& raw_data = tp.raw_data(); + return 0; + } + } + else if (tp.data_type() == 1) + { + return tp.float_data_size(); + } + else if (tp.data_type() == 7) + { + return tp.int64_data_size(); + } + else if (tp.data_type() == 6) + { + return tp.int32_data_size(); + } + else if (tp.data_type() == 11) + { + return tp.double_data_size(); + } -static int get_tensor_proto_data_size(const onnx::TensorProto& tp) { - if (tp.has_raw_data()) { - if (tp.data_type() == 1 || tp.data_type() == 6) { - const std::string& raw_data = tp.raw_data(); - int size = (int)raw_data.size() / 4; - return size; - } else if (tp.data_type() == 7 || tp.data_type() == 11) { - const std::string& raw_data = tp.raw_data(); - int size = (int)raw_data.size() / 8; - return size; - } else if (tp.data_type() == 9) { - const std::string& raw_data = tp.raw_data(); - return 0; - } - } else if (tp.data_type() == 1) { - return tp.float_data_size(); - } else if (tp.data_type() == 7) { - return tp.int64_data_size(); - } else if (tp.data_type() == 6) { - return tp.int32_data_size(); - } else if (tp.data_type() == 11) { - return tp.double_data_size(); - } - - return 0; + return 0; } -static void fwrite_tensor_proto_data(const onnx::TensorProto& tp, FILE* bp) { - int size = get_tensor_proto_data_size(tp); +static void fwrite_tensor_proto_data(const onnx::TensorProto& tp, FILE* bp) +{ + int size = get_tensor_proto_data_size(tp); - if (tp.has_raw_data()) { - const std::string& raw_data = tp.raw_data(); - fwrite(raw_data.data(), sizeof(float), size, bp); - } else if (tp.data_type() == 1) { - fwrite(tp.float_data().data(), sizeof(float), size, bp); - } + if (tp.has_raw_data()) + { + const std::string& raw_data = tp.raw_data(); + fwrite(raw_data.data(), sizeof(float), size, bp); + } + else if (tp.data_type() == 1) + { + fwrite(tp.float_data().data(), sizeof(float), size, bp); + } } -static void fwrite_tensor_proto_data_to_float(const onnx::TensorProto& tp, FILE* bp) { - int size = 
get_tensor_proto_data_size(tp); - size_t written_size; - if (tp.has_raw_data()) { - const std::string& raw_data = tp.raw_data(); - if (tp.data_type() == 6) { - int* intdataptr = (int*)raw_data.data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)intdataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } else if (tp.data_type() == 7) { - int64_t* intdataptr = (int64_t*)raw_data.data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)intdataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } else if (tp.data_type() == 9) { - bool* intdataptr = (bool*)raw_data.data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)intdataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } else if (tp.data_type() == 11) { - double* doubledataptr = (double*)raw_data.data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)doubledataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } - } else if (tp.data_type() == 6) { - int* intdataptr = (int*)tp.int32_data().data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)intdataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } else if (tp.data_type() == 7) { - int64_t* intdataptr = (int64_t*)tp.int64_data().data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)intdataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } else if (tp.data_type() == 9) { - int* intdataptr = (int*)tp.int64_data().data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)intdataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } else if (tp.data_type() == 11) { - double* doubledataptr = (double*)tp.double_data().data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)doubledataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } +static void fwrite_tensor_proto_data_to_float(const onnx::TensorProto& tp, FILE* bp) +{ + int size = get_tensor_proto_data_size(tp); + size_t written_size; + if (tp.has_raw_data()) + { + const std::string& raw_data = tp.raw_data(); + if (tp.data_type() == 6) + { + int* intdataptr = (int*)raw_data.data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)intdataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + else if (tp.data_type() == 7) + { + int64_t* intdataptr = (int64_t*)raw_data.data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)intdataptr[i]; + } + written_size = 
fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + else if (tp.data_type() == 9) + { + bool* intdataptr = (bool*)raw_data.data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)intdataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + else if (tp.data_type() == 11) + { + double* doubledataptr = (double*)raw_data.data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)doubledataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + } + else if (tp.data_type() == 6) + { + int* intdataptr = (int*)tp.int32_data().data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)intdataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + else if (tp.data_type() == 7) + { + int64_t* intdataptr = (int64_t*)tp.int64_data().data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)intdataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + else if (tp.data_type() == 9) + { + int* intdataptr = (int*)tp.int64_data().data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)intdataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + else if (tp.data_type() == 11) + { + double* doubledataptr = (double*)tp.double_data().data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)doubledataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } } diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.cpp old mode 100755 new mode 100644 index b865db7b25..c347cb97a9 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.cpp @@ -3,51 +3,60 @@ #include "../ncnn_ops_definer.h" -namespace mmdeploy { -using namespace ncnn; -DEFINE_LAYER_CREATOR(ConstantOfShape) -DEFINE_NCNN_OPS(ConstantOfShape, ConstantOfShape) -ConstantOfShape::ConstantOfShape() { - one_blob_only = true; - support_inplace = false; -} +namespace mmdeploy +{ + using namespace ncnn; + DEFINE_LAYER_CREATOR(ConstantOfShape) + DEFINE_NCNN_OPS(ConstantOfShape, ConstantOfShape) + ConstantOfShape::ConstantOfShape() + { + one_blob_only = true; + support_inplace = false; + } -int ConstantOfShape::load_param(const ParamDict& pd) { - val = pd.get(0, 0.f); - return 0; -} + int ConstantOfShape::load_param(const ParamDict& pd) + { + val = pd.get(0, 0.f); + return 0; + } -int ConstantOfShape::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { - int dims = bottom_blob.w - 1; - const float* bottom_ptr = bottom_blob; - const float* shape_ptr = bottom_ptr + 1; + int ConstantOfShape::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const + { + int dims = bottom_blob.w - 1; + const float* bottom_ptr = 
bottom_blob; + const float* shape_ptr = bottom_ptr + 1; - if (dims == 1) { - int w = (int)(shape_ptr[0] + 0.5); - size_t elemsize = sizeof(val); - top_blob.create(w, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - top_blob.fill(val); - return 0; - } else if (dims == 2) { - int h = (int)(shape_ptr[0] + 0.5); - int w = (int)(shape_ptr[1] + 0.5); - size_t elemsize = sizeof(val); - top_blob.create(w, h, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - top_blob.fill(val); - return 0; - } else if (dims == 3) { - int channels = (int)(shape_ptr[0] + 0.5); - int h = (int)(shape_ptr[1] + 0.5); - int w = (int)(shape_ptr[2] + 0.5); - size_t elemsize = sizeof(val); - top_blob.create(w, h, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - top_blob.fill(val); - return 0; - } - return -1; -} + if (dims == 1) + { + int w = (int)(shape_ptr[0] + 0.5); + size_t elemsize = sizeof(val); + top_blob.create(w, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + top_blob.fill(val); + return 0; + } + else if (dims == 2) + { + int h = (int)(shape_ptr[0] + 0.5); + int w = (int)(shape_ptr[1] + 0.5); + size_t elemsize = sizeof(val); + top_blob.create(w, h, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + top_blob.fill(val); + return 0; + } + else if (dims == 3) + { + int channels = (int)(shape_ptr[0] + 0.5); + int h = (int)(shape_ptr[1] + 0.5); + int w = (int)(shape_ptr[2] + 0.5); + size_t elemsize = sizeof(val); + top_blob.create(w, h, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + top_blob.fill(val); + return 0; + } + return -1; + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.h b/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.h old mode 100755 new mode 100644 index b61fb62c09..d068fd3196 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.h +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.h @@ -4,20 +4,21 @@ #include "layer.h" -namespace mmdeploy { +namespace mmdeploy +{ -class ConstantOfShape : public ncnn::Layer { - public: - ConstantOfShape(); + class ConstantOfShape : public ncnn::Layer + { + public: + ConstantOfShape(); - virtual int load_param(const ncnn::ParamDict& pd); + virtual int load_param(const ncnn::ParamDict& pd); - virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, - const ncnn::Option& opt) const; + virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const; - public: - float val; -}; + public: + float val; + }; } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.cpp old mode 100755 new mode 100644 index be3d75a248..c742b91df7 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.cpp @@ -4,330 +4,452 @@ #include "expand.h" #include "../ncnn_ops_definer.h" -namespace mmdeploy { -using namespace ncnn; -DEFINE_LAYER_CREATOR(Expand) -DEFINE_NCNN_OPS(Expand, Expand) -Expand::Expand() { - one_blob_only = false; - support_inplace = false; -} - -int Expand::forward(const std::vector& bottom_blobs, std::vector& top_blobs, - const Option& opt) const { - const Mat& bottom_blob = bottom_blobs[0]; - size_t elemsize = bottom_blob.elemsize; - const Mat& old_shape_blob = bottom_blobs[1]; - const int shape_width = old_shape_blob.w - 1; - Mat 
shape_blob(shape_width, elemsize, opt.workspace_allocator); - memcpy(shape_blob.row(0), old_shape_blob.row(0) + 1, shape_width * elemsize); - Mat& top_blob = top_blobs[0]; - - if (bottom_blob.dims == 1 && shape_blob.w == 1) { - int shape_0 = (int)(shape_blob[0] + 0.5); - if (bottom_blob.w != shape_0 && bottom_blob.w != 1 && shape_0 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d) vs (%d)\n", bottom_blob.w, shape_0); - } else if (bottom_blob.w == shape_0 || shape_0 == 1) { - top_blob.create(bottom_blob.w, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - - for (int i = 0; i < bottom_blob.w; i++) { - top_blob[i] = bottom_blob[i]; - } - } else if (bottom_blob.w == 1) { - top_blob.create(shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - - for (int i = 0; i < shape_0; i++) { - top_blob[i] = bottom_blob[0]; - } - } else { - fprintf(stderr, "error case\n"); - return -100; +namespace mmdeploy +{ + using namespace ncnn; + DEFINE_LAYER_CREATOR(Expand) + DEFINE_NCNN_OPS(Expand, Expand) + Expand::Expand() + { + one_blob_only = false; + support_inplace = false; } - return 0; - } else if (bottom_blob.dims == 1 && shape_blob.w == 2) { - int shape_0 = (int)(shape_blob[0] + 0.5); - int shape_1 = (int)(shape_blob[1] + 0.5); - if (bottom_blob.w != shape_1 && bottom_blob.w != 1 && shape_1 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (1, %d) vs (%d, %d)\n", bottom_blob.w, shape_0, - shape_1); - } else if (bottom_blob.w == shape_1 || shape_1 == 1) { - top_blob.create(bottom_blob.w, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int j = 0; j < shape_0; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.row(j)[i] = bottom_blob[i]; - } - } + int Expand::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const + { + const Mat& bottom_blob = bottom_blobs[0]; + size_t elemsize = bottom_blob.elemsize; + const Mat& old_shape_blob = bottom_blobs[1]; + const int shape_width = old_shape_blob.w - 1; + Mat shape_blob(shape_width, elemsize, opt.workspace_allocator); + memcpy(shape_blob.row(0), old_shape_blob.row(0) + 1, shape_width * elemsize); + Mat& top_blob = top_blobs[0]; - } else if (bottom_blob.w == 1) { - top_blob.create(shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; + if (bottom_blob.dims == 1 && shape_blob.w == 1) + { + int shape_0 = (int)(shape_blob[0] + 0.5); + if (bottom_blob.w != shape_0 && bottom_blob.w != 1 && shape_0 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d) vs (%d)\n", bottom_blob.w, shape_0); + } + else if (bottom_blob.w == shape_0 || shape_0 == 1) + { + top_blob.create(bottom_blob.w, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; - for (int j = 0; j < shape_0; j++) { - for (int i = 0; i < shape_1; i++) { - top_blob.row(j)[i] = bottom_blob[0]; - } - } + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob[i] = bottom_blob[i]; + } + } + else if (bottom_blob.w == 1) + { + top_blob.create(shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; - } else { - fprintf(stderr, "error case\n"); - return -100; - } - return 0; - } else if (bottom_blob.dims == 1 && shape_blob.w == 3) { - int shape_0 = (int)(shape_blob[0] + 0.5); - int shape_1 = (int)(shape_blob[1] + 0.5); - int shape_2 = (int)(shape_blob[2] + 0.5); - - if (bottom_blob.w != shape_2 && bottom_blob.w != 1 && shape_2 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (1, 1, %d) vs (%d, %d, %d)\n", 
bottom_blob.w, - shape_0, shape_1, shape_2); - } else if (bottom_blob.w == shape_2 || shape_2 == 1) { - top_blob.create(bottom_blob.w, shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob[i]; - } - } - } - } else if (bottom_blob.w == 1) { - top_blob.create(shape_2, shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob[0]; - } - } - } - } else { - fprintf(stderr, "error case\n"); - return -100; - } - return 0; - } else if (bottom_blob.dims == 2 && shape_blob.w == 2) { - int shape_0 = (int)(shape_blob[0] + 0.5); - int shape_1 = (int)(shape_blob[1] + 0.5); - if (bottom_blob.w != shape_1 && bottom_blob.w != 1 && shape_1 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d)\n", bottom_blob.h, - bottom_blob.w, shape_0, shape_1); - } else if (bottom_blob.h != shape_0 && bottom_blob.h != 1 && shape_0 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d)\n", bottom_blob.h, - bottom_blob.w, shape_0, shape_1); - } else if ((bottom_blob.w == shape_1 || shape_1 == 1) && - (bottom_blob.h == shape_0 || shape_0 == 1)) { - top_blob.create(bottom_blob.w, bottom_blob.h, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.row(j)[i] = bottom_blob.row(j)[i]; - } - } - } else if ((bottom_blob.w == shape_1 || shape_1 == 1) && (bottom_blob.h == 1)) { - top_blob.create(bottom_blob.w, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int j = 0; j < shape_0; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.row(j)[i] = bottom_blob.row(0)[i]; - } - } - } else if ((bottom_blob.w == 1) && (bottom_blob.h == shape_0 || shape_0 == 1)) { - top_blob.create(shape_1, bottom_blob.h, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < shape_1; i++) { - top_blob.row(j)[i] = bottom_blob.row(j)[0]; + for (int i = 0; i < shape_0; i++) + { + top_blob[i] = bottom_blob[0]; + } + } + else + { + fprintf(stderr, "error case\n"); + return -100; + } + return 0; } - } - } else if (bottom_blob.h == 1 && bottom_blob.w == 1) { - top_blob.create(shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int j = 0; j < shape_0; j++) { - for (int i = 0; i < shape_1; i++) { - top_blob.row(j)[i] = bottom_blob.row(0)[0]; - } - } - } else { - fprintf(stderr, "error case\n"); - return -100; - } - return 0; - } else if (bottom_blob.dims == 2 && shape_blob.w == 3) { - int shape_0 = (int)(shape_blob[0] + 0.5); - int shape_1 = (int)(shape_blob[1] + 0.5); - int shape_2 = (int)(shape_blob[2] + 0.5); - if (bottom_blob.w != shape_2 && bottom_blob.w != 1 && shape_2 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d, %d)\n", bottom_blob.h, - bottom_blob.w, shape_0, shape_1, shape_2); - } else if (bottom_blob.h != shape_1 && bottom_blob.h != 1 && shape_1 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d, %d)\n", bottom_blob.h, - bottom_blob.w, shape_0, shape_1, shape_2); - } else if ((bottom_blob.w == shape_2 || 
shape_2 == 1) && - (bottom_blob.h == shape_1 || shape_1 == 1)) { - top_blob.create(bottom_blob.w, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.row(j)[i]; - } - } - } - } else if ((bottom_blob.w == shape_2 || shape_2 == 1) && (bottom_blob.h == 1)) { - top_blob.create(bottom_blob.w, shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.row(0)[i]; - } - } - } - - } else if ((bottom_blob.w == 1) && (bottom_blob.h == shape_1 || shape_1 == 1)) { - top_blob.create(shape_2, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.row(j)[0]; - } - } - } + else if (bottom_blob.dims == 1 && shape_blob.w == 2) + { + int shape_0 = (int)(shape_blob[0] + 0.5); + int shape_1 = (int)(shape_blob[1] + 0.5); + if (bottom_blob.w != shape_1 && bottom_blob.w != 1 && shape_1 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (1, %d) vs (%d, %d)\n", bottom_blob.w, shape_0, shape_1); + } + else if (bottom_blob.w == shape_1 || shape_1 == 1) + { + top_blob.create(bottom_blob.w, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; - } else if (bottom_blob.h == 1 && bottom_blob.w == 1) { - top_blob.create(shape_2, shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.row(0)[0]; - } - } - } - } else { - fprintf(stderr, "error case\n"); - return -100; - } - return 0; - } else if (bottom_blob.dims == 3 && shape_blob.w == 3) { - int shape_0 = (int)(shape_blob[0] + 0.5); - int shape_1 = (int)(shape_blob[1] + 0.5); - int shape_2 = (int)(shape_blob[2] + 0.5); - if (bottom_blob.w != shape_2 && bottom_blob.w != 1 && shape_2 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d, %d) vs (%d, %d, %d)\n", bottom_blob.c, - bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); - } else if (bottom_blob.h != shape_1 && bottom_blob.h != 1 && shape_1 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d, %d) vs (%d, %d, %d)\n", bottom_blob.c, - bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); - } else if (bottom_blob.c != shape_0 && bottom_blob.c != 1 && shape_0 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d, %d) vs (%d, %d, %d)\n", bottom_blob.c, - bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); - } else if ((bottom_blob.w == shape_2 || shape_2 == 1) && - (bottom_blob.h == shape_1 || shape_1 == 1) && - (bottom_blob.c == shape_0 || shape_0 == 1)) { - top_blob.create(bottom_blob.w, bottom_blob.h, bottom_blob.c, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < bottom_blob.c; k++) { - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(j)[i]; - } - } - } - } else if ((bottom_blob.w == shape_2 || shape_2 == 1) && - 
(bottom_blob.h == shape_1 || shape_1 == 1) && (bottom_blob.c == 1)) { - top_blob.create(bottom_blob.w, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(j)[i]; - } - } - } - - } else if ((bottom_blob.w == shape_2 || shape_2 == 1) && (bottom_blob.h == 1) && - (bottom_blob.c == shape_0 || shape_0 == 1)) { - top_blob.create(bottom_blob.w, shape_1, bottom_blob.c, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < bottom_blob.c; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(0)[i]; - } - } - } + for (int j = 0; j < shape_0; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.row(j)[i] = bottom_blob[i]; + } + } + } + else if (bottom_blob.w == 1) + { + top_blob.create(shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; - } else if ((bottom_blob.w == shape_2 || shape_2 == 1) && (bottom_blob.h == 1) && - (bottom_blob.c == 1)) { - top_blob.create(bottom_blob.w, shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(0)[i]; - } + for (int j = 0; j < shape_0; j++) + { + for (int i = 0; i < shape_1; i++) + { + top_blob.row(j)[i] = bottom_blob[0]; + } + } + } + else + { + fprintf(stderr, "error case\n"); + return -100; + } + return 0; } - } + else if (bottom_blob.dims == 1 && shape_blob.w == 3) + { + int shape_0 = (int)(shape_blob[0] + 0.5); + int shape_1 = (int)(shape_blob[1] + 0.5); + int shape_2 = (int)(shape_blob[2] + 0.5); - } else if (bottom_blob.w == 1 && (bottom_blob.h == shape_1 || shape_1 == 1) && - (bottom_blob.c == shape_0 || shape_0 == 1)) { - top_blob.create(shape_2, bottom_blob.h, bottom_blob.c, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < bottom_blob.c; k++) { - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(j)[0]; - } + if (bottom_blob.w != shape_2 && bottom_blob.w != 1 && shape_2 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (1, 1, %d) vs (%d, %d, %d)\n", bottom_blob.w, shape_0, shape_1, shape_2); + } + else if (bottom_blob.w == shape_2 || shape_2 == 1) + { + top_blob.create(bottom_blob.w, shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob[i]; + } + } + } + } + else if (bottom_blob.w == 1) + { + top_blob.create(shape_2, shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob[0]; + } + } + } + } + else + { + fprintf(stderr, "error case\n"); + return -100; + } + return 0; } - } - } else if (bottom_blob.w == 1 && (bottom_blob.h == shape_1 || shape_1 == 1) && - (bottom_blob.c == 1)) { - top_blob.create(shape_2, bottom_blob.h, 
shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(j)[0]; - } + else if (bottom_blob.dims == 2 && shape_blob.w == 2) + { + int shape_0 = (int)(shape_blob[0] + 0.5); + int shape_1 = (int)(shape_blob[1] + 0.5); + if (bottom_blob.w != shape_1 && bottom_blob.w != 1 && shape_1 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d)\n", bottom_blob.h, bottom_blob.w, shape_0, shape_1); + } + else if (bottom_blob.h != shape_0 && bottom_blob.h != 1 && shape_0 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d)\n", bottom_blob.h, bottom_blob.w, shape_0, shape_1); + } + else if ((bottom_blob.w == shape_1 || shape_1 == 1) && + (bottom_blob.h == shape_0 || shape_0 == 1)) + { + top_blob.create(bottom_blob.w, bottom_blob.h, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.row(j)[i] = bottom_blob.row(j)[i]; + } + } + } + else if ((bottom_blob.w == shape_1 || shape_1 == 1) && (bottom_blob.h == 1)) + { + top_blob.create(bottom_blob.w, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int j = 0; j < shape_0; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.row(j)[i] = bottom_blob.row(0)[i]; + } + } + } + else if ((bottom_blob.w == 1) && (bottom_blob.h == shape_0 || shape_0 == 1)) + { + top_blob.create(shape_1, bottom_blob.h, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < shape_1; i++) + { + top_blob.row(j)[i] = bottom_blob.row(j)[0]; + } + } + } + else if (bottom_blob.h == 1 && bottom_blob.w == 1) + { + top_blob.create(shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int j = 0; j < shape_0; j++) + { + for (int i = 0; i < shape_1; i++) + { + top_blob.row(j)[i] = bottom_blob.row(0)[0]; + } + } + } + else + { + fprintf(stderr, "error case\n"); + return -100; + } + return 0; } - } - } else if (bottom_blob.w == 1 && bottom_blob.h == 1 && - (bottom_blob.c == shape_0 || shape_0 == 1)) { - top_blob.create(shape_2, shape_1, bottom_blob.c, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < bottom_blob.c; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(0)[0]; - } + else if (bottom_blob.dims == 2 && shape_blob.w == 3) + { + int shape_0 = (int)(shape_blob[0] + 0.5); + int shape_1 = (int)(shape_blob[1] + 0.5); + int shape_2 = (int)(shape_blob[2] + 0.5); + if (bottom_blob.w != shape_2 && bottom_blob.w != 1 && shape_2 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d, %d)\n", bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); + } + else if (bottom_blob.h != shape_1 && bottom_blob.h != 1 && shape_1 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d, %d)\n", bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); + } + else if ((bottom_blob.w == shape_2 || shape_2 == 1) && + (bottom_blob.h == shape_1 || shape_1 == 1)) + { + top_blob.create(bottom_blob.w, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; 
k++) + { + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.row(j)[i]; + } + } + } + } + else if ((bottom_blob.w == shape_2 || shape_2 == 1) && (bottom_blob.h == 1)) + { + top_blob.create(bottom_blob.w, shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.row(0)[i]; + } + } + } + } + else if ((bottom_blob.w == 1) && (bottom_blob.h == shape_1 || shape_1 == 1)) + { + top_blob.create(shape_2, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.row(j)[0]; + } + } + } + } + else if (bottom_blob.h == 1 && bottom_blob.w == 1) + { + top_blob.create(shape_2, shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.row(0)[0]; + } + } + } + } + else + { + fprintf(stderr, "error case\n"); + return -100; + } + return 0; } - } - } else if (bottom_blob.w == 1 && bottom_blob.h == 1 && bottom_blob.c == 1) { - top_blob.create(shape_2, shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(0)[0]; - } + else if (bottom_blob.dims == 3 && shape_blob.w == 3) + { + int shape_0 = (int)(shape_blob[0] + 0.5); + int shape_1 = (int)(shape_blob[1] + 0.5); + int shape_2 = (int)(shape_blob[2] + 0.5); + if (bottom_blob.w != shape_2 && bottom_blob.w != 1 && shape_2 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d, %d) vs (%d, %d, %d)\n", bottom_blob.c, bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); + } + else if (bottom_blob.h != shape_1 && bottom_blob.h != 1 && shape_1 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d, %d) vs (%d, %d, %d)\n", bottom_blob.c, bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); + } + else if (bottom_blob.c != shape_0 && bottom_blob.c != 1 && shape_0 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d, %d) vs (%d, %d, %d)\n", bottom_blob.c, bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); + } + else if ((bottom_blob.w == shape_2 || shape_2 == 1) && + (bottom_blob.h == shape_1 || shape_1 == 1) && + (bottom_blob.c == shape_0 || shape_0 == 1)) + { + top_blob.create(bottom_blob.w, bottom_blob.h, bottom_blob.c, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < bottom_blob.c; k++) + { + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(j)[i]; + } + } + } + } + else if ((bottom_blob.w == shape_2 || shape_2 == 1) && + (bottom_blob.h == shape_1 || shape_1 == 1) && (bottom_blob.c == 1)) + { + top_blob.create(bottom_blob.w, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < bottom_blob.h; j++) + { + for 
(int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(j)[i]; + } + } + } + } + else if ((bottom_blob.w == shape_2 || shape_2 == 1) && (bottom_blob.h == 1) && + (bottom_blob.c == shape_0 || shape_0 == 1)) + { + top_blob.create(bottom_blob.w, shape_1, bottom_blob.c, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < bottom_blob.c; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(0)[i]; + } + } + } + } + else if ((bottom_blob.w == shape_2 || shape_2 == 1) && (bottom_blob.h == 1) && + (bottom_blob.c == 1)) + { + top_blob.create(bottom_blob.w, shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(0)[i]; + } + } + } + } + else if (bottom_blob.w == 1 && (bottom_blob.h == shape_1 || shape_1 == 1) && + (bottom_blob.c == shape_0 || shape_0 == 1)) + { + top_blob.create(shape_2, bottom_blob.h, bottom_blob.c, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < bottom_blob.c; k++) + { + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(j)[0]; + } + } + } + } + else if (bottom_blob.w == 1 && (bottom_blob.h == shape_1 || shape_1 == 1) && + (bottom_blob.c == 1)) + { + top_blob.create(shape_2, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(j)[0]; + } + } + } + } + else if (bottom_blob.w == 1 && bottom_blob.h == 1 && + (bottom_blob.c == shape_0 || shape_0 == 1)) + { + top_blob.create(shape_2, shape_1, bottom_blob.c, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < bottom_blob.c; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(0)[0]; + } + } + } + } + else if (bottom_blob.w == 1 && bottom_blob.h == 1 && bottom_blob.c == 1) + { + top_blob.create(shape_2, shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(0)[0]; + } + } + } + } + else + { + fprintf(stderr, "error case\n"); + return -100; + } + return 0; } - } - } else { - fprintf(stderr, "error case\n"); - return -100; + fprintf(stderr, "Layer: Expand, bottom_blob.dims: %d, shape_blob.w: %d\n", bottom_blob.dims, shape_blob.w); + return -1; } - return 0; - } - fprintf(stderr, "Layer: Expand, bottom_blob.dims: %d, shape_blob.w: %d\n", bottom_blob.dims, - shape_blob.w); - return -1; -} } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.h b/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.h old mode 100755 new mode 100644 index 3dca54fb0f..a378965d03 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.h +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.h @@ -4,15 +4,16 @@ #include "layer.h" -namespace 
mmdeploy { +namespace mmdeploy +{ -class Expand : public ncnn::Layer { - public: - Expand(); + class Expand : public ncnn::Layer + { + public: + Expand(); - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, - const ncnn::Option& opt) const; -}; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const ncnn::Option& opt) const; + }; } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.cpp index 4b6bd34630..24ea7f7181 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.cpp @@ -4,157 +4,180 @@ #include "../ncnn_ops_definer.h" #include "assert.h" -namespace mmdeploy { -using namespace ncnn; -DEFINE_LAYER_CREATOR(Gather) -DEFINE_NCNN_OPS(Gather, Gather) -Gather::Gather() { - one_blob_only = false; - support_inplace = false; -} - -int Gather::load_param(const ParamDict &pd) { - axis = pd.get(0, 0); - - return 0; -} - -// Gather only support 1-dim of indices, because the data and indices all has -// implicit batch in ncnn, this will lead to wrong shape to match onnx result. -// When indices dim equals to 1, after eliminating implicit batch, the indices -// dim still be 1. So there is only 1 implicit batch in data, this will make -// the shape match onnx result. -int Gather::forward(const std::vector &bottom_blobs, std::vector &top_blobs, - const Option &opt) const { - const Mat &bottom_blob = bottom_blobs[0]; - const Mat &indices = bottom_blobs[1]; - int dims = bottom_blob.dims; - int indices_dims = indices.dims; - size_t elemsize = bottom_blob.elemsize; - int positive_axis = axis < 0 ? dims + axis : axis; - Mat &top_blob = top_blobs[0]; - assert(indices.dims == 1); - const float *indices_ptr = indices; - - if (dims == 1 && indices_dims == 1) // positive_axis == 0 - { - int w = indices.w; - top_blob.create(w, elemsize, opt.blob_allocator); - if (top_blob.empty()) { - return -100; - } - const float *ptr = bottom_blob; - float *outptr = top_blob; - for (int i = 0; i < w; i++) { - float indice = indices_ptr[i]; - outptr[i] = ptr[(int)(indice + 0.5)]; +namespace mmdeploy +{ + using namespace ncnn; + DEFINE_LAYER_CREATOR(Gather) + DEFINE_NCNN_OPS(Gather, Gather) + Gather::Gather() + { + one_blob_only = false; + support_inplace = false; } - return 0; - } - - if (dims == 2 && positive_axis == 0 && indices_dims == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - top_blob.create(w, indices.w, elemsize, opt.blob_allocator); - // w -> w - // h -> indices.w - // h * w -> indices.w * w - if (top_blob.empty()) { - return -100; - } - const float *ptr = bottom_blob; - float *outptr = top_blob; - for (int i = 0; i < indices.w; i++) { - const int selected = (int)(indices_ptr[i] + 0.5); - memcpy(top_blob.row(i), bottom_blob.row(selected), w * elemsize); - } + int Gather::load_param(const ParamDict& pd) + { + axis = pd.get(0, 0); - return 0; - } - - if (dims == 2 && positive_axis == 1 && indices_dims == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - top_blob.create(indices.w, h, elemsize, opt.blob_allocator); - // w -> h - // h -> indices.w - // h * w -> indices.w * h - if (top_blob.empty()) { - return -100; - } - const float *ptr = bottom_blob; - float *outptr = top_blob; - for (int j = 0; j < h; j++) { - for (int i = 0; i < indices.w; i++) { - int selected = (int)(indices_ptr[i] + 0.5); - outptr[j * indices.w + i] = ptr[j * w + selected]; - } + return 0; } - return 0; - } - if (dims == 3 
&& positive_axis == 0 && indices_dims == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - top_blob.create(w, h, indices.w, elemsize, opt.blob_allocator); + // Gather only support 1-dim of indices, because the data and indices all has + // implicit batch in ncnn, this will lead to wrong shape to match onnx result. + // When indices dim equals to 1, after eliminating implicit batch, the indices + // dim still be 1. So there is only 1 implicit batch in data, this will make + // the shape match onnx result. + int Gather::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const + { + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& indices = bottom_blobs[1]; + int dims = bottom_blob.dims; + int indices_dims = indices.dims; + size_t elemsize = bottom_blob.elemsize; + int positive_axis = axis < 0 ? dims + axis : axis; + Mat& top_blob = top_blobs[0]; + assert(indices.dims == 1); + const float* indices_ptr = indices; + + if (dims == 1 && indices_dims == 1) // positive_axis == 0 + { + int w = indices.w; + top_blob.create(w, elemsize, opt.blob_allocator); + if (top_blob.empty()) + { + return -100; + } + const float* ptr = bottom_blob; + float* outptr = top_blob; + for (int i = 0; i < w; i++) + { + float indice = indices_ptr[i]; + outptr[i] = ptr[(int)(indice + 0.5)]; + } + + return 0; + } - if (top_blob.empty()) { - return -100; - } - for (int i = 0; i < indices.w; i++) { - int selected = (int)(indices_ptr[i] + 0.5); - const unsigned char *ptr = bottom_blob.channel(selected); - unsigned char *outptr = top_blob.channel(i); + if (dims == 2 && positive_axis == 0 && indices_dims == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + top_blob.create(w, indices.w, elemsize, opt.blob_allocator); + // w -> w + // h -> indices.w + // h * w -> indices.w * w + if (top_blob.empty()) + { + return -100; + } + const float* ptr = bottom_blob; + float* outptr = top_blob; + for (int i = 0; i < indices.w; i++) + { + const int selected = (int)(indices_ptr[i] + 0.5); + memcpy(top_blob.row(i), bottom_blob.row(selected), w * elemsize); + } + + return 0; + } - memcpy(outptr, ptr, w * h * elemsize); - } - return 0; - } - - if (dims == 3 && positive_axis == 1 && indices_dims == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - top_blob.create(w, indices.w, channels, elemsize, opt.blob_allocator); -#pragma omp parallel for num_threads(opt.num_threads) - // use parallel programming - for (int i = 0; i < channels; i++) { - float *outptr = top_blob.channel(i); - const float *ptr = bottom_blob.channel(i); - for (int j = 0; j < indices.w; j++) { - int selected = (int)(indices_ptr[j] + 0.5); - for (int k = 0; k < w; k++) { - outptr[j * w + k] = ptr[selected * w + k]; + if (dims == 2 && positive_axis == 1 && indices_dims == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + top_blob.create(indices.w, h, elemsize, opt.blob_allocator); + // w -> h + // h -> indices.w + // h * w -> indices.w * h + if (top_blob.empty()) + { + return -100; + } + const float* ptr = bottom_blob; + float* outptr = top_blob; + for (int j = 0; j < h; j++) + { + for (int i = 0; i < indices.w; i++) + { + int selected = (int)(indices_ptr[i] + 0.5); + outptr[j * indices.w + i] = ptr[j * w + selected]; + } + } + return 0; } - } - } - return 0; - } + if (dims == 3 && positive_axis == 0 && indices_dims == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + top_blob.create(w, h, indices.w, elemsize, 
opt.blob_allocator); + + if (top_blob.empty()) + { + return -100; + } + for (int i = 0; i < indices.w; i++) + { + int selected = (int)(indices_ptr[i] + 0.5); + const unsigned char* ptr = bottom_blob.channel(selected); + unsigned char* outptr = top_blob.channel(i); + + memcpy(outptr, ptr, w * h * elemsize); + } + return 0; + } - if (dims == 3 && positive_axis == 2 && indices_dims == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - top_blob.create(indices.w, h, channels, elemsize, opt.blob_allocator); + if (dims == 3 && positive_axis == 1 && indices_dims == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + top_blob.create(w, indices.w, channels, elemsize, opt.blob_allocator); #pragma omp parallel for num_threads(opt.num_threads) - // use parallel programming - for (int i = 0; i < channels; i++) { - float *outptr = top_blob.channel(i); - const float *ptr = bottom_blob.channel(i); - for (int j = 0; j < h; j++) { - for (int k = 0; k < indices.w; k++) { - int selected = (int)(indices_ptr[k] + 0.5); - outptr[j * indices.w + k] = ptr[j * w + selected]; + // use parallel programming + for (int i = 0; i < channels; i++) + { + float* outptr = top_blob.channel(i); + const float* ptr = bottom_blob.channel(i); + for (int j = 0; j < indices.w; j++) + { + int selected = (int)(indices_ptr[j] + 0.5); + for (int k = 0; k < w; k++) + { + outptr[j * w + k] = ptr[selected * w + k]; + } + } + } + + return 0; } - } - } - return 0; - } - return 0; -} + if (dims == 3 && positive_axis == 2 && indices_dims == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + top_blob.create(indices.w, h, channels, elemsize, opt.blob_allocator); +#pragma omp parallel for num_threads(opt.num_threads) + // use parallel programming + for (int i = 0; i < channels; i++) + { + float* outptr = top_blob.channel(i); + const float* ptr = bottom_blob.channel(i); + for (int j = 0; j < h; j++) + { + for (int k = 0; k < indices.w; k++) + { + int selected = (int)(indices_ptr[k] + 0.5); + outptr[j * indices.w + k] = ptr[j * w + selected]; + } + } + } + return 0; + } + + return 0; + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.h b/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.h old mode 100755 new mode 100644 index af6eb6365e..13d38e4bd0 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.h +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.h @@ -4,20 +4,21 @@ #include "layer.h" -namespace mmdeploy { +namespace mmdeploy +{ -class Gather : public ncnn::Layer { - public: - Gather(); + class Gather : public ncnn::Layer + { + public: + Gather(); - virtual int load_param(const ncnn::ParamDict& pd); + virtual int load_param(const ncnn::ParamDict& pd); - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, - const ncnn::Option& opt) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const ncnn::Option& opt) const; - public: - int axis; -}; + public: + int axis; + }; } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_definer.h b/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_definer.h old mode 100755 new mode 100644 index 509c8c0ce0..bd5d9ca23e --- a/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_definer.h +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_definer.h @@ -7,22 +7,24 @@ #include "layer.h" #include "ncnn_ops_register.h" -namespace mmdeploy { - -class NCNNOpsDefiner { - public: - 
NCNNOpsDefiner(const std::string& ops_name, const ncnn::layer_creator_func& creator_func = 0,
-                 const ncnn::layer_destroyer_func& destroyer_func = 0)
-      : _ops_name(ops_name) {
-    get_mmdeploy_layer_creator()[_ops_name.c_str()] = creator_func;
-  }
-
- private:
-  const std::string _ops_name;
-};
+namespace mmdeploy
+{
+
+    class NCNNOpsDefiner
+    {
+      public:
+        NCNNOpsDefiner(const std::string& ops_name, const ncnn::layer_creator_func& creator_func = 0, const ncnn::layer_destroyer_func& destroyer_func = 0)
+            : _ops_name(ops_name)
+        {
+            get_mmdeploy_layer_creator()[_ops_name.c_str()] = creator_func;
+        }
+
+      private:
+        const std::string _ops_name;
+    };

 #define DEFINE_NCNN_OPS(ops_name, OpsLayer) \
-  static mmdeploy::NCNNOpsDefiner NCNNOpsDefiner##ops_name{#ops_name, OpsLayer##_layer_creator};
+    static mmdeploy::NCNNOpsDefiner NCNNOpsDefiner##ops_name{#ops_name, OpsLayer##_layer_creator};

 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.cpp
old mode 100755
new mode 100644
index 42bc050a1c..85d4f66d04
--- a/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.cpp
+++ b/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.cpp
@@ -3,32 +3,38 @@
 #include <map>

-std::map<const char*, ncnn::layer_creator_func>& get_mmdeploy_layer_creator() {
-  static std::map<const char*, ncnn::layer_creator_func> _layer_creator_map;
-  return _layer_creator_map;
+std::map<const char*, ncnn::layer_creator_func>& get_mmdeploy_layer_creator()
+{
+    static std::map<const char*, ncnn::layer_creator_func> _layer_creator_map;
+    return _layer_creator_map;
 }

-std::map<const char*, ncnn::layer_destroyer_func>& get_mmdeploy_layer_destroyer() {
-  static std::map<const char*, ncnn::layer_destroyer_func> _layer_destroyer_map;
-  return _layer_destroyer_map;
+std::map<const char*, ncnn::layer_destroyer_func>& get_mmdeploy_layer_destroyer()
+{
+    static std::map<const char*, ncnn::layer_destroyer_func> _layer_destroyer_map;
+    return _layer_destroyer_map;
 }

-int register_mmdeploy_custom_layers(ncnn::Net &net) {
-  auto &layer_creator_map = get_mmdeploy_layer_creator();
-  auto &layer_destroyer_map = get_mmdeploy_layer_destroyer();
+int register_mmdeploy_custom_layers(ncnn::Net& net)
+{
+    auto& layer_creator_map = get_mmdeploy_layer_creator();
+    auto& layer_destroyer_map = get_mmdeploy_layer_destroyer();

-  for (auto const &creator_pair : layer_creator_map) {
-    auto creator_name = creator_pair.first;
-    auto creator_func = creator_pair.second;
+    for (auto const& creator_pair : layer_creator_map)
+    {
+        auto creator_name = creator_pair.first;
+        auto creator_func = creator_pair.second;

-    ncnn::layer_destroyer_func destroyer_func = 0;
-    if (layer_destroyer_map.find(creator_name) != layer_destroyer_map.end()) {
-      destroyer_func = layer_destroyer_map[creator_name];
+        ncnn::layer_destroyer_func destroyer_func = 0;
+        if (layer_destroyer_map.find(creator_name) != layer_destroyer_map.end())
+        {
+            destroyer_func = layer_destroyer_map[creator_name];
+        }
+        int ret = net.register_custom_layer(creator_name, creator_func, destroyer_func);
+        if (0 != ret)
+        {
+            return ret;
+        }
     }
-    int ret = net.register_custom_layer(creator_name, creator_func, destroyer_func);
-    if (0 != ret) {
-      return ret;
-    }
-  }
-  return 0;
+    return 0;
 }
diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.h b/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.h
old mode 100755
new mode 100644
index 0d9974f783..b0de664040
--- a/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.h
+++ b/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.h
@@ -11,6 +11,6 @@
 MMDEPLOY_API std::map<const char*, ncnn::layer_creator_func>& get_mmdeploy_layer_creator();
 MMDEPLOY_API std::map<const char*, ncnn::layer_destroyer_func>& get_mmdeploy_layer_destroyer();

-MMDEPLOY_API int register_mmdeploy_custom_layers(ncnn::Net& net);
+MMDEPLOY_API int register_mmdeploy_custom_layers(ncnn::Net& net);
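The `DEFINE_NCNN_OPS` macro above performs static registration: each op's translation unit drops a definer object into the creator map at load time, and `register_mmdeploy_custom_layers` replays that map onto a net. A minimal caller-side sketch, assuming the standard ncnn `Net` API; the model file names are placeholders:

    #include "ncnn_ops_register.h"
    #include "net.h"

    ncnn::Net net;
    // Custom mmdeploy layers (Gather, Shape, TensorSlice, TopK, ...) must be
    // registered before parsing a param file that references them.
    register_mmdeploy_custom_layers(net);
    net.load_param("end2end.param");
    net.load_model("end2end.bin");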
#endif diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.cpp old mode 100755 new mode 100644 index f538eabbac..17ae195659 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.cpp @@ -3,45 +3,56 @@ #include "../ncnn_ops_definer.h" -namespace mmdeploy { -using namespace ncnn; -DEFINE_LAYER_CREATOR(Shape) -DEFINE_NCNN_OPS(Shape, Shape) -Shape::Shape() { - one_blob_only = true; - support_inplace = false; -} +namespace mmdeploy +{ + using namespace ncnn; + DEFINE_LAYER_CREATOR(Shape) + DEFINE_NCNN_OPS(Shape, Shape) + Shape::Shape() + { + one_blob_only = true; + support_inplace = false; + } -int Shape::forward(const Mat &bottom_blob, Mat &top_blob, const Option &opt) const { - int dims = bottom_blob.dims; - int w = bottom_blob.w; - size_t elemsize = sizeof(float); - top_blob.create(dims + 1, elemsize, opt.blob_allocator); - if (top_blob.empty()) { - return -100; - } - float *outptr = top_blob; + int Shape::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const + { + int dims = bottom_blob.dims; + int w = bottom_blob.w; + size_t elemsize = sizeof(float); + top_blob.create(dims + 1, elemsize, opt.blob_allocator); + if (top_blob.empty()) + { + return -100; + } + float* outptr = top_blob; - if (dims == 1) { - outptr[0] = 1.0f; - outptr[1] = w; - } else if (dims == 2) { - int h = bottom_blob.h; - outptr[0] = 1.0f; - outptr[1] = h; - outptr[2] = w; - } else if (dims == 3) { - int h = bottom_blob.h; - int channels = bottom_blob.c; - outptr[0] = 1.0f; - outptr[1] = channels; - outptr[2] = h; - outptr[3] = w; - } else { - fprintf(stdout, "Unsupported dims=%d\n", dims); - } + if (dims == 1) + { + outptr[0] = 1.0f; + outptr[1] = w; + } + else if (dims == 2) + { + int h = bottom_blob.h; + outptr[0] = 1.0f; + outptr[1] = h; + outptr[2] = w; + } + else if (dims == 3) + { + int h = bottom_blob.h; + int channels = bottom_blob.c; + outptr[0] = 1.0f; + outptr[1] = channels; + outptr[2] = h; + outptr[3] = w; + } + else + { + fprintf(stdout, "Unsupported dims=%d\n", dims); + } - return 0; -} + return 0; + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.h b/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.h old mode 100755 new mode 100644 index 863dc77c1d..2330f57ba4 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.h +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.h @@ -4,15 +4,16 @@ #include "layer.h" -namespace mmdeploy { +namespace mmdeploy +{ -class Shape : public ncnn::Layer { - public: - Shape(); + class Shape : public ncnn::Layer + { + public: + Shape(); - virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, - const ncnn::Option& opt) const; -}; + virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const; + }; } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.cpp index 9f2ced1992..b77c9ce56f 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.cpp @@ -5,202 +5,250 @@ #include "../ncnn_ops_definer.h" -namespace mmdeploy { -using namespace ncnn; -DEFINE_LAYER_CREATOR(TensorSlice) -DEFINE_NCNN_OPS(TensorSlice, TensorSlice) -TensorSlice::TensorSlice() { - one_blob_only = true; - support_inplace = false; -} - -int TensorSlice::load_param(const ParamDict& pd) { - starts = 
pd.get(0, Mat()); - ends = pd.get(1, Mat()); - axes = pd.get(2, Mat()); - steps = pd.get(3, Mat()); - if (axes.w == 0) { - axes.create(starts.w); - int* axes_ptr = axes; - for (int i = 0; i < starts.w; i++) { - axes_ptr[i] = i; +namespace mmdeploy +{ + using namespace ncnn; + DEFINE_LAYER_CREATOR(TensorSlice) + DEFINE_NCNN_OPS(TensorSlice, TensorSlice) + TensorSlice::TensorSlice() + { + one_blob_only = true; + support_inplace = false; } - } - if (steps.w == 0) { - steps.create(axes.w); - steps.fill(1); - } - return 0; -} - -static inline int get_shape_by_axes(const Mat& blob, int axes, int dims) { - switch (dims - axes) { - case 0: - return blob.w; - case 1: - return blob.h; - case 2: - return blob.c; - default: - fprintf(stderr, "wrong axes %d!\n", axes); - return -1; - } - return 0; -} -int TensorSlice::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; - const int* start_ptr = starts; - const int* end_ptr = ends; - const int* axes_ptr = axes; - const int* step_ptr = steps; - if (starts.w > dims || ends.w > dims) { - fprintf(stderr, "start/end attributes shape error!\n"); - return -100; - } - if (axes.w != 1) { - fprintf(stderr, - "axes.w must be 1 because any of multiaxes slice is regarded as " - "multi-staged onnx slice in pytorch2onnx."); - } - if (dims == 1) { - for (int i = 0; i < axes.w; i++) { - int positive_axis = axes_ptr[i] < 0 ? dims + axes_ptr[i] : axes_ptr[i]; - int step = step_ptr[i]; - std::vector temp_val; - int start = start_ptr[i]; - int end = end_ptr[i]; - int cur = start; - if (step > 0) { - while (cur < end && cur < bottom_blob.w) { - temp_val.push_back(bottom_blob[cur]); - cur += step; + int TensorSlice::load_param(const ParamDict& pd) + { + starts = pd.get(0, Mat()); + ends = pd.get(1, Mat()); + axes = pd.get(2, Mat()); + steps = pd.get(3, Mat()); + if (axes.w == 0) + { + axes.create(starts.w); + int* axes_ptr = axes; + for (int i = 0; i < starts.w; i++) + { + axes_ptr[i] = i; + } } - } else if (step < 0) { - while (cur > end && cur > 0) { - temp_val.push_back(bottom_blob[cur]); - cur += step; + if (steps.w == 0) + { + steps.create(axes.w); + steps.fill(1); } - } else { - fprintf(stderr, "step should not be 0!\n"); - return -100; - } - top_blob.create(temp_val.size(), elemsize, opt.blob_allocator); - for (int i = 0; i < temp_val.size(); i++) { - top_blob[i] = temp_val[i]; - } - } - return 0; - } - if (dims == 2) { - std::vector > active_indice; - std::vector indices; - for (int i = 0; i < bottom_blob.h; i++) { - indices.push_back(i); + return 0; } - active_indice.push_back(indices); - indices.clear(); - for (int i = 0; i < bottom_blob.w; i++) { - indices.push_back(i); - } - active_indice.push_back(indices); - for (int i = 0; i < axes.w; i++) { - int positive_axis = axes_ptr[i] < 0 ? dims + axes_ptr[i] : axes_ptr[i]; - int step = step_ptr[i]; - int start = start_ptr[i]; - int end = end_ptr[i]; - int dim_shape = get_shape_by_axes(bottom_blob, positive_axis, dims); - int dim_shape_test = get_shape_by_axes(bottom_blob, positive_axis, dims - 1); - if (dim_shape < 0) { - return -1; - } - end = end < dim_shape ? 
end : dim_shape; - int cur = start; - std::vector temp_indice; - if (step > 0) { - while (cur < end && cur < dim_shape) { - temp_indice.push_back(cur); - cur += step; - } - } else if (step < 0) { - while (cur > end && cur > 0) { - temp_indice.push_back(cur); - cur += step; - } - } else { - fprintf(stderr, "step should not be 0!\n"); - return -100; - } - active_indice[positive_axis - 1] = temp_indice; - active_indice[positive_axis - 1].resize(temp_indice.size()); - } - top_blob.create((int)active_indice[1].size(), (int)active_indice[0].size(), elemsize, - opt.blob_allocator); - for (int i = 0; i < active_indice[0].size(); i++) { - for (int j = 0; j < active_indice[1].size(); j++) { - top_blob.row(i)[j] = bottom_blob.row(active_indice[0][i])[active_indice[1][j]]; - } - } - return 0; - } - if (dims == 3) { - std::vector > active_indice; - std::vector indices; - for (int i = 0; i < bottom_blob.c; i++) { - indices.push_back(i); - } - active_indice.push_back(indices); - indices.clear(); - for (int i = 0; i < bottom_blob.h; i++) { - indices.push_back(i); - } - active_indice.push_back(indices); - indices.clear(); - for (int i = 0; i < bottom_blob.w; i++) { - indices.push_back(i); + static inline int get_shape_by_axes(const Mat& blob, int axes, int dims) + { + switch (dims - axes) + { + case 0: + return blob.w; + case 1: + return blob.h; + case 2: + return blob.c; + default: + fprintf(stderr, "wrong axes %d!\n", axes); + return -1; + } + return 0; } - active_indice.push_back(indices); - for (int i = 0; i < axes.w; i++) { - int positive_axis = axes_ptr[i] < 0 ? dims + axes_ptr[i] : axes_ptr[i]; - int step = step_ptr[i]; - int start = start_ptr[i]; - int end = end_ptr[i]; - int cur = start; - std::vector temp_indice; - if (step > 0) { - while (cur < end && cur < bottom_blob.w) { - temp_indice.push_back(cur); - cur += step; + int TensorSlice::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const + { + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + const int* start_ptr = starts; + const int* end_ptr = ends; + const int* axes_ptr = axes; + const int* step_ptr = steps; + if (starts.w > dims || ends.w > dims) + { + fprintf(stderr, "start/end attributes shape error!\n"); + return -100; } - } else if (step < 0) { - while (cur > end && cur > 0) { - temp_indice.push_back(cur); - cur += step; + if (axes.w != 1) + { + fprintf(stderr, + "axes.w must be 1 because any of multiaxes slice is regarded as " + "multi-staged onnx slice in pytorch2onnx."); } - } else { - fprintf(stderr, "step should not be 0!\n"); - return -100; - } - active_indice[positive_axis - 1] = temp_indice; - active_indice[positive_axis - 1].resize(temp_indice.size()); - } - top_blob.create((int)active_indice[2].size(), (int)active_indice[1].size(), - (int)active_indice[0].size(), elemsize, opt.blob_allocator); - for (int i = 0; i < active_indice[0].size(); i++) { - for (int j = 0; j < active_indice[1].size(); j++) { - for (int k = 0; k < active_indice[2].size(); k++) { - top_blob.channel(i).row(j)[k] = bottom_blob.channel(active_indice[0][i]) - .row(active_indice[1][j])[active_indice[2][k]]; + if (dims == 1) + { + for (int i = 0; i < axes.w; i++) + { + int positive_axis = axes_ptr[i] < 0 ? 
+                int step = step_ptr[i];
+                std::vector<float> temp_val;
+                int start = start_ptr[i];
+                int end = end_ptr[i];
+                int cur = start;
+                if (step > 0)
+                {
+                    while (cur < end && cur < bottom_blob.w)
+                    {
+                        temp_val.push_back(bottom_blob[cur]);
+                        cur += step;
+                    }
+                }
+                else if (step < 0)
+                {
+                    while (cur > end && cur > 0)
+                    {
+                        temp_val.push_back(bottom_blob[cur]);
+                        cur += step;
+                    }
+                }
+                else
+                {
+                    fprintf(stderr, "step should not be 0!\n");
+                    return -100;
+                }
+                top_blob.create(temp_val.size(), elemsize, opt.blob_allocator);
+                for (int i = 0; i < temp_val.size(); i++)
+                {
+                    top_blob[i] = temp_val[i];
+                }
+            }
+            return 0;
+        }
+        if (dims == 2)
+        {
+            std::vector<std::vector<int>> active_indice;
+            std::vector<int> indices;
+            for (int i = 0; i < bottom_blob.h; i++)
+            {
+                indices.push_back(i);
+            }
+            active_indice.push_back(indices);
+            indices.clear();
+            for (int i = 0; i < bottom_blob.w; i++)
+            {
+                indices.push_back(i);
+            }
+            active_indice.push_back(indices);
+            for (int i = 0; i < axes.w; i++)
+            {
+                int positive_axis = axes_ptr[i] < 0 ? dims + axes_ptr[i] : axes_ptr[i];
+                int step = step_ptr[i];
+                int start = start_ptr[i];
+                int end = end_ptr[i];
+                int dim_shape = get_shape_by_axes(bottom_blob, positive_axis, dims);
+                int dim_shape_test = get_shape_by_axes(bottom_blob, positive_axis, dims - 1);
+                if (dim_shape < 0)
+                {
+                    return -1;
+                }
+                end = end < dim_shape ? end : dim_shape;
+                int cur = start;
+                std::vector<int> temp_indice;
+                if (step > 0)
+                {
+                    while (cur < end && cur < dim_shape)
+                    {
+                        temp_indice.push_back(cur);
+                        cur += step;
+                    }
+                }
+                else if (step < 0)
+                {
+                    while (cur > end && cur > 0)
+                    {
+                        temp_indice.push_back(cur);
+                        cur += step;
+                    }
+                }
+                else
+                {
+                    fprintf(stderr, "step should not be 0!\n");
+                    return -100;
+                }
+                active_indice[positive_axis - 1] = temp_indice;
+                active_indice[positive_axis - 1].resize(temp_indice.size());
+            }
+            top_blob.create((int)active_indice[1].size(), (int)active_indice[0].size(), elemsize, opt.blob_allocator);
+            for (int i = 0; i < active_indice[0].size(); i++)
+            {
+                for (int j = 0; j < active_indice[1].size(); j++)
+                {
+                    top_blob.row(i)[j] = bottom_blob.row(active_indice[0][i])[active_indice[1][j]];
+                }
+            }
+            return 0;
        }
+        if (dims == 3)
+        {
+            std::vector<std::vector<int>> active_indice;
+            std::vector<int> indices;
+            for (int i = 0; i < bottom_blob.c; i++)
+            {
+                indices.push_back(i);
+            }
+            active_indice.push_back(indices);
+            indices.clear();
+            for (int i = 0; i < bottom_blob.h; i++)
+            {
+                indices.push_back(i);
+            }
+            active_indice.push_back(indices);
+            indices.clear();
+            for (int i = 0; i < bottom_blob.w; i++)
+            {
+                indices.push_back(i);
+            }
+            active_indice.push_back(indices);
+            for (int i = 0; i < axes.w; i++)
+            {
+                int positive_axis = axes_ptr[i] < 0 ? dims + axes_ptr[i] : axes_ptr[i];
+                int step = step_ptr[i];
+
+                int start = start_ptr[i];
+                int end = end_ptr[i];
+                int cur = start;
+                std::vector<int> temp_indice;
+                if (step > 0)
+                {
+                    while (cur < end && cur < bottom_blob.w)
+                    {
+                        temp_indice.push_back(cur);
+                        cur += step;
+                    }
+                }
+                else if (step < 0)
+                {
+                    while (cur > end && cur > 0)
+                    {
+                        temp_indice.push_back(cur);
+                        cur += step;
+                    }
+                }
+                else
+                {
+                    fprintf(stderr, "step should not be 0!\n");
+                    return -100;
+                }
+                active_indice[positive_axis - 1] = temp_indice;
+                active_indice[positive_axis - 1].resize(temp_indice.size());
+            }
+            top_blob.create((int)active_indice[2].size(), (int)active_indice[1].size(), (int)active_indice[0].size(), elemsize, opt.blob_allocator);
+            for (int i = 0; i < active_indice[0].size(); i++)
+            {
+                for (int j = 0; j < active_indice[1].size(); j++)
+                {
+                    for (int k = 0; k < active_indice[2].size(); k++)
+                    {
+                        top_blob.channel(i).row(j)[k] = bottom_blob.channel(active_indice[0][i])
+                                                            .row(active_indice[1][j])[active_indice[2][k]];
+                    }
+                }
+            }
+            return 0;
+        }
+
+        return 0;
+    }
} // namespace mmdeploy
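A quick way to sanity-check the reformatted layer is to drive it directly through the ncnn layer API. The sketch below is illustrative only and not part of the patch: input values and the expected output are hypothetical, and it assumes ncnn's public ParamDict::set(int, const ncnn::Mat&). The param ids follow load_param above: 0 = starts, 1 = ends, 2 = axes, 3 = steps, with axes and steps defaulting to [0..n) and 1 when omitted.

// Illustrative sketch: exercising TensorSlice standalone on a 1-D blob.
#include "tensorslice.h"

#include <cstdio>

int main()
{
    // starts=[1], ends=[7]; axes/steps left empty so load_param
    // defaults them to axes=[0], steps=[1].
    ncnn::Mat starts(1), ends(1);
    ((int*)starts)[0] = 1;
    ((int*)ends)[0] = 7;

    ncnn::ParamDict pd;
    pd.set(0, starts);  // id 0: starts
    pd.set(1, ends);    // id 1: ends

    mmdeploy::TensorSlice layer;
    layer.load_param(pd);

    ncnn::Mat in(8);  // 1-D blob holding [0, 1, ..., 7]
    for (int i = 0; i < 8; i++) in[i] = (float)i;

    ncnn::Mat out;
    ncnn::Option opt;
    layer.forward(in, out, opt);

    for (int i = 0; i < out.w; i++) printf("%.0f ", out[i]);  // expected: 1 2 3 4 5 6
    printf("\n");
    return 0;
}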
diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.h b/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.h
old mode 100755
new mode 100644
index 9164d43335..14342c6f81
--- a/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.h
+++ b/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.h
@@ -4,23 +4,24 @@
 #include "layer.h"
 
-namespace mmdeploy {
+namespace mmdeploy
+{
 
-class TensorSlice : public ncnn::Layer {
- public:
-  TensorSlice();
+    class TensorSlice : public ncnn::Layer
+    {
+      public:
+        TensorSlice();
 
-  virtual int load_param(const ncnn::ParamDict& pd);
+        virtual int load_param(const ncnn::ParamDict& pd);
 
-  virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob,
-                      const ncnn::Option& opt) const;
+        virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const;
 
- public:
-  ncnn::Mat starts;
-  ncnn::Mat ends;
-  ncnn::Mat axes;
-  ncnn::Mat steps;
-};
+      public:
+        ncnn::Mat starts;
+        ncnn::Mat ends;
+        ncnn::Mat axes;
+        ncnn::Mat steps;
+    };
 
} // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.cpp
index f618831568..91235fa476 100644
--- a/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.cpp
+++ b/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.cpp
@@ -6,872 +6,1118 @@
 #include <functional>
 #include "../ncnn_ops_definer.h"
 
-namespace mmdeploy {
-using namespace ncnn;
-DEFINE_LAYER_CREATOR(TopK)
-DEFINE_NCNN_OPS(TopK, TopK)
-
-TopK::TopK() {
-  one_blob_only = false;
-  support_inplace = false;
-}
-int TopK::load_param(const ParamDict& pd) {
-  axis = pd.get(0, -1);
-  largest = pd.get(1, 1);
-  sorted = pd.get(2, 1);
-  keep_dims = pd.get(3, 1);
-
-  return 0;
-}
-int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs,
-                  const Option& opt) const {
-  int dims = bottom_blobs[0].dims;
-  int positive_axis = axis < 0 ? dims + axis : axis;
-  int topk;
-  if (bottom_blobs.size() == 2) {
-    const Mat& topk_blob = bottom_blobs[1];
-    topk = (int)(topk_blob[0] + 0.5);
-  } else if (bottom_blobs.size() == 1) {
-    topk = 1;
-  } else {
-    fprintf(stderr, "topk input blobs should be 1 or 2, but not %ld\n", bottom_blobs.size());
-    return -103;
-  }
-
-  // To do: Cut the top_val_blob after unit test. And we should change them in
-  // param files.
-  // Adaptive outputs. For onnx TopK, we output 2 blobs, for ArgMax, we output
-  // 1 blob.
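One idiom is worth calling out before the long forward implementation that follows: indices are stored negated, as std::make_pair(value, -i), so that sorting the pairs with std::greater orders equal values by ascending original index — the tie-break ONNX TopK specifies. A minimal standalone illustration, independent of ncnn and not part of the patch:

// Illustrative sketch: the pair-sorting trick used throughout TopK::forward.
// With std::greater, pairs compare by value first; on equal values the larger
// -index (i.e. the smaller original index) wins.
#include <algorithm>
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

int main()
{
    const float data[] = {3.f, 5.f, 5.f, 1.f};
    std::vector<std::pair<float, int>> vec(4);
    for (int i = 0; i < 4; i++) vec[i] = std::make_pair(data[i], -i);

    const int topk = 2;
    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater<std::pair<float, int>>());

    // Prints "5 at 1" then "5 at 2": the tie keeps ascending index order.
    for (int i = 0; i < topk; i++)
        printf("%g at %d\n", vec[i].first, -vec[i].second);
    return 0;
}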
- Mat& top_val_blob = top_blobs[0]; - Mat& top_ind_blob = top_blobs.size() == 2 ? top_blobs[1] : top_val_blob; - - if (topk > 1) { - // real topk - if (keep_dims == 0) { - fprintf(stderr, "real topk should not reduce dims!\n"); - return -102; +namespace mmdeploy +{ + using namespace ncnn; + DEFINE_LAYER_CREATOR(TopK) + DEFINE_NCNN_OPS(TopK, TopK) + + TopK::TopK() + { + one_blob_only = false; + support_inplace = false; } - if (dims == 1 && positive_axis == 0) { - if (topk > bottom_blobs[0].w) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(topk, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - - top_ind_blob.create(topk, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - - const float* ptr = bottom_blobs[0]; - std::vector > vec; - vec.resize(bottom_blobs[0].w); - - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = std::make_pair(ptr[i], -i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater >()); - } else if (largest == 0) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = std::make_pair(ptr[i], i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::less >()); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - float* valptr = top_val_blob; - float* indptr = top_ind_blob; - if (sorted == 1) { - for (int i = 0; i < topk; i++) { - valptr[i] = vec[i].first; - indptr[i] = abs(vec[i].second); - } - } else if (sorted == 0) { - int cur = 0; - float valtarget = vec[topk - 1].first; - int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); - - // pair comparison - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - if (cur >= topk) break; - if (bottom_blobs[0][i] > valtarget) { - valptr[cur] = bottom_blobs[0][i]; - indptr[cur] = i; - cur++; - } else if (bottom_blobs[0][i] == valtarget && i <= indtarget) { - valptr[cur] = bottom_blobs[0][i]; - indptr[cur] = i; - cur++; - } - } - } else { - for (int i = 0; i < bottom_blobs[0].w; i++) { - if (cur >= topk) break; - if (bottom_blobs[0][i] < valtarget) { - valptr[cur] = bottom_blobs[0][i]; - indptr[cur] = i; - cur++; - } else if (bottom_blobs[0][i] == valtarget && i <= indtarget) { - valptr[cur] = bottom_blobs[0][i]; - indptr[cur] = i; - cur++; - } - } - } - } + int TopK::load_param(const ParamDict& pd) + { + axis = pd.get(0, -1); + largest = pd.get(1, 1); + sorted = pd.get(2, 1); + keep_dims = pd.get(3, 1); + + return 0; } - if (dims == 2 && positive_axis == 0) { - if (topk > bottom_blobs[0].h) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - - top_ind_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - - for (int col = 0; col < bottom_blobs[0].w; col++) { - std::vector > vec; - vec.resize(bottom_blobs[0].h); - - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - vec[i] = std::make_pair(bottom_blobs[0].row(i)[col], -i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater >()); - } else if (largest == 0) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - vec[i] = std::make_pair(bottom_blobs[0].row(i)[col], i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::less >()); - } else { - fprintf(stderr, "largest 
attribute should be 0 or 1, but not %d\n", largest); - return -100; + int TopK::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const + { + int dims = bottom_blobs[0].dims; + int positive_axis = axis < 0 ? dims + axis : axis; + int topk; + if (bottom_blobs.size() == 2) + { + const Mat& topk_blob = bottom_blobs[1]; + topk = (int)(topk_blob[0] + 0.5); } - if (sorted == 1) { - for (int i = 0; i < topk; i++) { - top_val_blob.row(i)[col] = vec[i].first; - top_ind_blob.row(i)[col] = abs(vec[i].second); - } - } else if (sorted == 0) { - int cur = 0; - float valtarget = vec[topk - 1].first; - int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].row(i)[col] > valtarget) { - top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; - top_ind_blob.row(cur)[col] = i; - cur++; - } else if (bottom_blobs[0].row(i)[col] == valtarget && i <= indtarget) { - top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; - top_ind_blob.row(cur)[col] = i; - cur++; - } - } - } else { - for (int i = 0; i < bottom_blobs[0].h; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].row(i)[col] < valtarget) { - top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; - top_ind_blob.row(cur)[col] = i; - cur++; - } else if (bottom_blobs[0].row(i)[col] == valtarget && i <= indtarget) { - top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; - top_ind_blob.row(cur)[col] = i; - cur++; - } - } - } - } else { - fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); - return -100; + else if (bottom_blobs.size() == 1) + { + topk = 1; } - } - } - if (dims == 2 && positive_axis == 1) { - if (topk > bottom_blobs[0].w) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - - top_ind_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - - for (int r = 0; r < bottom_blobs[0].h; r++) { - std::vector > vec; - vec.resize(bottom_blobs[0].w); - - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = std::make_pair(bottom_blobs[0].row(r)[i], -i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater >()); - } else if (largest == 0) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = std::make_pair(bottom_blobs[0].row(r)[i], i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::less >()); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; + else + { + fprintf(stderr, "topk input blobs should be 1 or 2, but not %ld\n", bottom_blobs.size()); + return -103; } - if (sorted == 1) { - for (int i = 0; i < topk; i++) { - top_val_blob.row(r)[i] = vec[i].first; - top_ind_blob.row(r)[i] = abs(vec[i].second); - } - } else if (sorted == 0) { - int cur = 0; - float valtarget = vec[topk - 1].first; - int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].row(r)[i] > valtarget) { - top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; - top_ind_blob.row(r)[cur] = i; - cur++; - } else if (bottom_blobs[0].row(r)[i] == valtarget && i <= indtarget) { - top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; - top_ind_blob.row(r)[cur] 
= i; - cur++; - } - } - } else { - for (int i = 0; i < bottom_blobs[0].w; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].row(r)[i] < valtarget) { - top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; - top_ind_blob.row(r)[cur] = i; - cur++; - } else if (bottom_blobs[0].row(r)[i] == valtarget && i <= indtarget) { - top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; - top_ind_blob.row(r)[cur] = i; - cur++; - } - } - } + // To do: Cut the top_val_blob after unit test. And we should change them in + // param files. + // Adaptive outputs. For onnx TopK, we output 2 blobs, for ArgMax, we output + // 1 blob. + Mat& top_val_blob = top_blobs[0]; + Mat& top_ind_blob = top_blobs.size() == 2 ? top_blobs[1] : top_val_blob; - } else { - fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); - return -100; - } - } - } - if (dims == 3 && positive_axis == 0) { - if (topk > bottom_blobs[0].c) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - - top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - - for (int r = 0; r < bottom_blobs[0].h; r++) { - for (int col = 0; col < bottom_blobs[0].w; col++) { - std::vector > vec; - vec.resize(bottom_blobs[0].c); - - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].c; i++) { - vec[i] = std::make_pair(bottom_blobs[0].channel(i).row(r)[col], -i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater >()); - } else if (largest == 0) { - for (int i = 0; i < bottom_blobs[0].c; i++) { - vec[i] = std::make_pair(bottom_blobs[0].channel(i).row(r)[col], i); + if (topk > 1) + { + // real topk + if (keep_dims == 0) + { + fprintf(stderr, "real topk should not reduce dims!\n"); + return -102; } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::less >()); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - - if (sorted == 1) { - for (int i = 0; i < topk; i++) { - top_val_blob.channel(i).row(r)[col] = vec[i].first; - top_ind_blob.channel(i).row(r)[col] = abs(vec[i].second); + if (dims == 1 && positive_axis == 0) + { + if (topk > bottom_blobs[0].w) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(topk, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + top_ind_blob.create(topk, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + + const float* ptr = bottom_blobs[0]; + std::vector> vec; + vec.resize(bottom_blobs[0].w); + + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = std::make_pair(ptr[i], -i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater>()); + } + else if (largest == 0) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = std::make_pair(ptr[i], i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::less>()); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + float* valptr = top_val_blob; + float* indptr = top_ind_blob; + if (sorted == 1) + { + for (int i = 0; i < topk; i++) + { + valptr[i] = vec[i].first; + indptr[i] = abs(vec[i].second); + } + } + else if (sorted == 0) + { + int cur = 0; + float valtarget = vec[topk - 
1].first; + int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); + + // pair comparison + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0][i] > valtarget) + { + valptr[cur] = bottom_blobs[0][i]; + indptr[cur] = i; + cur++; + } + else if (bottom_blobs[0][i] == valtarget && i <= indtarget) + { + valptr[cur] = bottom_blobs[0][i]; + indptr[cur] = i; + cur++; + } + } + } + else + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0][i] < valtarget) + { + valptr[cur] = bottom_blobs[0][i]; + indptr[cur] = i; + cur++; + } + else if (bottom_blobs[0][i] == valtarget && i <= indtarget) + { + valptr[cur] = bottom_blobs[0][i]; + indptr[cur] = i; + cur++; + } + } + } + } } - } else if (sorted == 0) { - int cur = 0; - float valtarget = vec[topk - 1].first; - int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].c; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].channel(i).row(r)[col] > valtarget) { - top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; - top_ind_blob.channel(cur).row(r)[col] = i; - cur++; - } else if (bottom_blobs[0].channel(i).row(r)[col] == valtarget && i <= indtarget) { - top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; - top_ind_blob.channel(cur).row(r)[col] = i; - cur++; - } - } - } else { - for (int i = 0; i < bottom_blobs[0].c; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].channel(i).row(r)[col] < valtarget) { - top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; - top_ind_blob.channel(cur).row(r)[col] = i; - cur++; - } else if (bottom_blobs[0].channel(i).row(r)[col] == valtarget && i <= indtarget) { - top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; - top_ind_blob.channel(cur).row(r)[col] = i; - cur++; - } - } + if (dims == 2 && positive_axis == 0) + { + if (topk > bottom_blobs[0].h) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + top_ind_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + + for (int col = 0; col < bottom_blobs[0].w; col++) + { + std::vector> vec; + vec.resize(bottom_blobs[0].h); + + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].row(i)[col], -i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater>()); + } + else if (largest == 0) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].row(i)[col], i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::less>()); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + if (sorted == 1) + { + for (int i = 0; i < topk; i++) + { + top_val_blob.row(i)[col] = vec[i].first; + top_ind_blob.row(i)[col] = abs(vec[i].second); + } + } + else if (sorted == 0) + { + int cur = 0; + float valtarget = vec[topk - 1].first; + int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].row(i)[col] > valtarget) + { + top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; + 
top_ind_blob.row(cur)[col] = i; + cur++; + } + else if (bottom_blobs[0].row(i)[col] == valtarget && i <= indtarget) + { + top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; + top_ind_blob.row(cur)[col] = i; + cur++; + } + } + } + else + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].row(i)[col] < valtarget) + { + top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; + top_ind_blob.row(cur)[col] = i; + cur++; + } + else if (bottom_blobs[0].row(i)[col] == valtarget && i <= indtarget) + { + top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; + top_ind_blob.row(cur)[col] = i; + cur++; + } + } + } + } + else + { + fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); + return -100; + } + } } + if (dims == 2 && positive_axis == 1) + { + if (topk > bottom_blobs[0].w) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; - } else { - fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); - return -100; - } - } - } - } - if (dims == 3 && positive_axis == 1) { - if (topk > bottom_blobs[0].h) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - - top_ind_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - - for (int page = 0; page < bottom_blobs[0].c; page++) { - for (int col = 0; col < bottom_blobs[0].w; col++) { - std::vector > vec; - vec.resize(bottom_blobs[0].h); - - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(i)[col], -i); + top_ind_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + + for (int r = 0; r < bottom_blobs[0].h; r++) + { + std::vector> vec; + vec.resize(bottom_blobs[0].w); + + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].row(r)[i], -i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater>()); + } + else if (largest == 0) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].row(r)[i], i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::less>()); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + + if (sorted == 1) + { + for (int i = 0; i < topk; i++) + { + top_val_blob.row(r)[i] = vec[i].first; + top_ind_blob.row(r)[i] = abs(vec[i].second); + } + } + else if (sorted == 0) + { + int cur = 0; + float valtarget = vec[topk - 1].first; + int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].row(r)[i] > valtarget) + { + top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; + top_ind_blob.row(r)[cur] = i; + cur++; + } + else if (bottom_blobs[0].row(r)[i] == valtarget && i <= indtarget) + { + top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; + top_ind_blob.row(r)[cur] = i; + cur++; + } + } + } + else + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].row(r)[i] < 
valtarget) + { + top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; + top_ind_blob.row(r)[cur] = i; + cur++; + } + else if (bottom_blobs[0].row(r)[i] == valtarget && i <= indtarget) + { + top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; + top_ind_blob.row(r)[cur] = i; + cur++; + } + } + } + } + else + { + fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); + return -100; + } + } } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater >()); - } else if (largest == 0) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(i)[col], i); + if (dims == 3 && positive_axis == 0) + { + if (topk > bottom_blobs[0].c) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + + for (int r = 0; r < bottom_blobs[0].h; r++) + { + for (int col = 0; col < bottom_blobs[0].w; col++) + { + std::vector> vec; + vec.resize(bottom_blobs[0].c); + + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].c; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].channel(i).row(r)[col], -i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater>()); + } + else if (largest == 0) + { + for (int i = 0; i < bottom_blobs[0].c; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].channel(i).row(r)[col], i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::less>()); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + + if (sorted == 1) + { + for (int i = 0; i < topk; i++) + { + top_val_blob.channel(i).row(r)[col] = vec[i].first; + top_ind_blob.channel(i).row(r)[col] = abs(vec[i].second); + } + } + else if (sorted == 0) + { + int cur = 0; + float valtarget = vec[topk - 1].first; + int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].c; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].channel(i).row(r)[col] > valtarget) + { + top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; + top_ind_blob.channel(cur).row(r)[col] = i; + cur++; + } + else if (bottom_blobs[0].channel(i).row(r)[col] == valtarget && i <= indtarget) + { + top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; + top_ind_blob.channel(cur).row(r)[col] = i; + cur++; + } + } + } + else + { + for (int i = 0; i < bottom_blobs[0].c; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].channel(i).row(r)[col] < valtarget) + { + top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; + top_ind_blob.channel(cur).row(r)[col] = i; + cur++; + } + else if (bottom_blobs[0].channel(i).row(r)[col] == valtarget && i <= indtarget) + { + top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; + top_ind_blob.channel(cur).row(r)[col] = i; + cur++; + } + } + } + } + else + { + fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); + return -100; + } + } + } } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::less >()); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - - if (sorted == 1) { - for 
(int i = 0; i < topk; i++) { - top_val_blob.channel(page).row(i)[col] = vec[i].first; - top_ind_blob.channel(page).row(i)[col] = abs(vec[i].second); + if (dims == 3 && positive_axis == 1) + { + if (topk > bottom_blobs[0].h) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + top_ind_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + + for (int page = 0; page < bottom_blobs[0].c; page++) + { + for (int col = 0; col < bottom_blobs[0].w; col++) + { + std::vector> vec; + vec.resize(bottom_blobs[0].h); + + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(i)[col], -i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater>()); + } + else if (largest == 0) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(i)[col], i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::less>()); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + + if (sorted == 1) + { + for (int i = 0; i < topk; i++) + { + top_val_blob.channel(page).row(i)[col] = vec[i].first; + top_ind_blob.channel(page).row(i)[col] = abs(vec[i].second); + } + } + else if (sorted == 0) + { + int cur = 0; + float valtarget = vec[topk - 1].first; + int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); + for (int i = 0; i < bottom_blobs[0].h; i++) + { + if (cur >= topk) break; + if (largest == 1) + { + if (bottom_blobs[0].channel(page).row(i)[col] > valtarget) + { + top_val_blob.channel(page).row(cur)[col] = + bottom_blobs[0].channel(page).row(i)[col]; + top_ind_blob.channel(page).row(cur)[col] = i; + cur++; + } + else if (bottom_blobs[0].channel(page).row(i)[col] == valtarget && + i <= indtarget) + { + top_val_blob.channel(page).row(cur)[col] = + bottom_blobs[0].channel(page).row(i)[col]; + top_ind_blob.channel(page).row(cur)[col] = i; + cur++; + } + } + else + { + if (bottom_blobs[0].channel(page).row(i)[col] < valtarget) + { + top_val_blob.channel(page).row(cur)[col] = + bottom_blobs[0].channel(page).row(i)[col]; + top_ind_blob.channel(page).row(cur)[col] = i; + cur++; + } + else if (bottom_blobs[0].channel(page).row(i)[col] == valtarget && + i <= indtarget) + { + top_val_blob.channel(page).row(cur)[col] = + bottom_blobs[0].channel(page).row(i)[col]; + top_ind_blob.channel(page).row(cur)[col] = i; + cur++; + } + } + } + } + else + { + fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); + return -100; + } + } + } } - } else if (sorted == 0) { - int cur = 0; - float valtarget = vec[topk - 1].first; - int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); - for (int i = 0; i < bottom_blobs[0].h; i++) { - if (cur >= topk) break; - if (largest == 1) { - if (bottom_blobs[0].channel(page).row(i)[col] > valtarget) { - top_val_blob.channel(page).row(cur)[col] = - bottom_blobs[0].channel(page).row(i)[col]; - top_ind_blob.channel(page).row(cur)[col] = i; - cur++; - } else if (bottom_blobs[0].channel(page).row(i)[col] == valtarget && - i <= indtarget) { - top_val_blob.channel(page).row(cur)[col] = - bottom_blobs[0].channel(page).row(i)[col]; - top_ind_blob.channel(page).row(cur)[col] = i; - cur++; - } - } else { - if 
(bottom_blobs[0].channel(page).row(i)[col] < valtarget) { - top_val_blob.channel(page).row(cur)[col] = - bottom_blobs[0].channel(page).row(i)[col]; - top_ind_blob.channel(page).row(cur)[col] = i; - cur++; - } else if (bottom_blobs[0].channel(page).row(i)[col] == valtarget && - i <= indtarget) { - top_val_blob.channel(page).row(cur)[col] = - bottom_blobs[0].channel(page).row(i)[col]; - top_ind_blob.channel(page).row(cur)[col] = i; - cur++; - } - } + if (dims == 3 && positive_axis == 2) + { + if (topk > bottom_blobs[0].w) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + top_ind_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + + for (int page = 0; page < bottom_blobs[0].c; page++) + { + for (int r = 0; r < bottom_blobs[0].h; r++) + { + std::vector> vec; + vec.resize(bottom_blobs[0].w); + + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(r)[i], -i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater>()); + } + else if (largest == 0) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(r)[i], i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::less>()); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + + if (sorted == 1) + { + for (int i = 0; i < topk; i++) + { + top_val_blob.channel(page).row(r)[i] = vec[i].first; + top_ind_blob.channel(page).row(r)[i] = abs(vec[i].second); + } + } + else if (sorted == 0) + { + int cur = 0; + float valtarget = vec[topk - 1].first; + int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].channel(page).row(r)[i] > valtarget) + { + top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; + top_ind_blob.channel(page).row(r)[cur] = i; + cur++; + } + else if (bottom_blobs[0].channel(page).row(r)[i] == valtarget && i <= indtarget) + { + top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; + top_ind_blob.channel(page).row(r)[cur] = i; + cur++; + } + } + } + else + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].channel(page).row(r)[i] < valtarget) + { + top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; + top_ind_blob.channel(page).row(r)[cur] = i; + cur++; + } + else if (bottom_blobs[0].channel(page).row(r)[i] == valtarget && i <= indtarget) + { + top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; + top_ind_blob.channel(page).row(r)[cur] = i; + cur++; + } + } + } + } + else + { + fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); + return -100; + } + } + } } - } else { - fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); - return -100; - } } - } - } - if (dims == 3 && positive_axis == 2) { - if (topk > bottom_blobs[0].w) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_val_blob.empty()) 
return -100; - - top_ind_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - - for (int page = 0; page < bottom_blobs[0].c; page++) { - for (int r = 0; r < bottom_blobs[0].h; r++) { - std::vector > vec; - vec.resize(bottom_blobs[0].w); - - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(r)[i], -i); + else + { + if (topk <= 0) + { + fprintf(stderr, "topk should not <= 0!\n"); + return -102; } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater >()); - } else if (largest == 0) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(r)[i], i); + if (dims == 1 && positive_axis == 0) + { + if (topk > bottom_blobs[0].w) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(topk, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + if (top_blobs.size() == 2) + { + top_ind_blob.create(topk, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + + const float* ptr = bottom_blobs[0]; + std::vector vec; + vec.resize(bottom_blobs[0].w); + float* valptr = top_val_blob; + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob; + + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = ptr[i]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + valptr[0] = *index_iter; + if (top_blobs.size() == 2) + indptr[0] = std::distance(vec.begin(), index_iter); + else + valptr[0] = std::distance(vec.begin(), index_iter); // replace with index + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[0] = *index_iter; + if (top_blobs.size() == 2) + indptr[0] = std::distance(vec.begin(), index_iter); + else + valptr[0] = std::distance(vec.begin(), index_iter); // replace with index + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::less >()); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - - if (sorted == 1) { - for (int i = 0; i < topk; i++) { - top_val_blob.channel(page).row(r)[i] = vec[i].first; - top_ind_blob.channel(page).row(r)[i] = abs(vec[i].second); + if (dims == 2 && positive_axis == 0) + { + if (keep_dims == 1) + { + top_val_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + } + else + { + top_val_blob.create(bottom_blobs[0].w, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].w, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + } + const float* ptr = bottom_blobs[0]; + std::vector vec; + vec.resize(bottom_blobs[0].h); + float* valptr = top_val_blob; + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob; + for (int col = 0; col < bottom_blobs[0].w; col++) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = ptr[i * bottom_blobs[0].w + col]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + valptr[col] = *index_iter; + 
if (top_blobs.size() == 2) + indptr[col] = std::distance(vec.begin(), index_iter); + else + valptr[col] = std::distance(vec.begin(), index_iter); + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[col] = *index_iter; + if (top_blobs.size() == 2) + indptr[col] = std::distance(vec.begin(), index_iter); + else + valptr[col] = std::distance(vec.begin(), index_iter); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + } } - } else if (sorted == 0) { - int cur = 0; - float valtarget = vec[topk - 1].first; - int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].channel(page).row(r)[i] > valtarget) { - top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; - top_ind_blob.channel(page).row(r)[cur] = i; - cur++; - } else if (bottom_blobs[0].channel(page).row(r)[i] == valtarget && i <= indtarget) { - top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; - top_ind_blob.channel(page).row(r)[cur] = i; - cur++; - } - } - } else { - for (int i = 0; i < bottom_blobs[0].w; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].channel(page).row(r)[i] < valtarget) { - top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; - top_ind_blob.channel(page).row(r)[cur] = i; - cur++; - } else if (bottom_blobs[0].channel(page).row(r)[i] == valtarget && i <= indtarget) { - top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; - top_ind_blob.channel(page).row(r)[cur] = i; - cur++; - } - } + if (dims == 2 && positive_axis == 1) + { + if (keep_dims == 1) + { + top_val_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + } + else + { + top_val_blob.create(bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + } + + const float* ptr = bottom_blobs[0]; + std::vector vec; + vec.resize(bottom_blobs[0].w); + float* valptr = top_val_blob; + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob; + + for (int r = 0; r < bottom_blobs[0].h; r++) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = ptr[r * bottom_blobs[0].w + i]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + valptr[r] = *index_iter; + if (top_blobs.size() == 2) + indptr[r] = std::distance(vec.begin(), index_iter); + else + valptr[r] = std::distance(vec.begin(), index_iter); + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[r] = *index_iter; + if (top_blobs.size() == 2) + indptr[r] = std::distance(vec.begin(), index_iter); + else + valptr[r] = std::distance(vec.begin(), index_iter); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + } } + if (dims == 3 && positive_axis == 0) + { + if (keep_dims == 1) + { + top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + 
top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + } + else + { + top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + } + const float* ptr = bottom_blobs[0]; + std::vector vec; + vec.resize(bottom_blobs[0].c); + float* valptr = top_val_blob; + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob; - } else { - fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); - return -100; - } - } - } - } - } else { - if (topk <= 0) { - fprintf(stderr, "topk should not <= 0!\n"); - return -102; - } - if (dims == 1 && positive_axis == 0) { - if (topk > bottom_blobs[0].w) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(topk, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - - if (top_blobs.size() == 2) { - top_ind_blob.create(topk, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } - - const float* ptr = bottom_blobs[0]; - std::vector vec; - vec.resize(bottom_blobs[0].w); - float* valptr = top_val_blob; - float* indptr; - if (top_blobs.size() == 2) indptr = top_ind_blob; - - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = ptr[i]; - } - if (largest == 1) { - auto index_iter = std::max_element(vec.begin(), vec.end()); - valptr[0] = *index_iter; - if (top_blobs.size() == 2) - indptr[0] = std::distance(vec.begin(), index_iter); - else - valptr[0] = std::distance(vec.begin(), index_iter); // replace with index - } else if (largest == 0) { - auto index_iter = std::min_element(vec.begin(), vec.end()); - valptr[0] = *index_iter; - if (top_blobs.size() == 2) - indptr[0] = std::distance(vec.begin(), index_iter); - else - valptr[0] = std::distance(vec.begin(), index_iter); // replace with index - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - } - if (dims == 2 && positive_axis == 0) { - if (keep_dims == 1) { - top_val_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } + for (int r = 0; r < bottom_blobs[0].h; r++) + { + for (int col = 0; col < bottom_blobs[0].w; col++) + { + for (int i = 0; i < bottom_blobs[0].c; i++) + { + ptr = bottom_blobs[0].channel(i); + vec[i] = ptr[r * bottom_blobs[0].w + col]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + valptr[r * top_val_blob.w + col] = *index_iter; + if (top_blobs.size() == 2) + indptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); + else + valptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[r * top_val_blob.w + col] = *index_iter; - } else { - top_val_blob.create(bottom_blobs[0].w, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + indptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); + else + valptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); + } + else + { + fprintf(stderr, 
"largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + } + } + } + if (dims == 3 && positive_axis == 1) + { + if (keep_dims == 1) + { + top_val_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].w, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } - } - const float* ptr = bottom_blobs[0]; - std::vector vec; - vec.resize(bottom_blobs[0].h); - float* valptr = top_val_blob; - float* indptr; - if (top_blobs.size() == 2) indptr = top_ind_blob; - for (int col = 0; col < bottom_blobs[0].w; col++) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - vec[i] = ptr[i * bottom_blobs[0].w + col]; - } - if (largest == 1) { - auto index_iter = std::max_element(vec.begin(), vec.end()); - valptr[col] = *index_iter; - if (top_blobs.size() == 2) - indptr[col] = std::distance(vec.begin(), index_iter); - else - valptr[col] = std::distance(vec.begin(), index_iter); - - } else if (largest == 0) { - auto index_iter = std::min_element(vec.begin(), vec.end()); - valptr[col] = *index_iter; - if (top_blobs.size() == 2) - indptr[col] = std::distance(vec.begin(), index_iter); - else - valptr[col] = std::distance(vec.begin(), index_iter); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - } - } - if (dims == 2 && positive_axis == 1) { - if (keep_dims == 1) { - top_val_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } + std::vector vec; + vec.resize(bottom_blobs[0].h); - } else { - top_val_blob.create(bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } - } - - const float* ptr = bottom_blobs[0]; - std::vector vec; - vec.resize(bottom_blobs[0].w); - float* valptr = top_val_blob; - float* indptr; - if (top_blobs.size() == 2) indptr = top_ind_blob; - - for (int r = 0; r < bottom_blobs[0].h; r++) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = ptr[r * bottom_blobs[0].w + i]; - } - if (largest == 1) { - auto index_iter = std::max_element(vec.begin(), vec.end()); - valptr[r] = *index_iter; - if (top_blobs.size() == 2) - indptr[r] = std::distance(vec.begin(), index_iter); - else - valptr[r] = std::distance(vec.begin(), index_iter); - - } else if (largest == 0) { - auto index_iter = std::min_element(vec.begin(), vec.end()); - valptr[r] = *index_iter; - if (top_blobs.size() == 2) - indptr[r] = std::distance(vec.begin(), index_iter); - else - valptr[r] = std::distance(vec.begin(), index_iter); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - } - } - if (dims == 3 && positive_axis == 0) { - if (keep_dims == 1) { - top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); - if 
(top_ind_blob.empty()) return -100; - } + for (int page = 0; page < bottom_blobs[0].c; page++) + { + const float* ptr = bottom_blobs[0].channel(page); + float* valptr = top_val_blob.channel(page); + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob.channel(page); + for (int col = 0; col < bottom_blobs[0].w; col++) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = ptr[i * bottom_blobs[0].w + col]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + valptr[col] = *index_iter; + if (top_blobs.size() == 2) + indptr[col] = std::distance(vec.begin(), index_iter); + else + valptr[col] = std::distance(vec.begin(), index_iter); + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[col] = *index_iter; + if (top_blobs.size() == 2) + indptr[col] = std::distance(vec.begin(), index_iter); + else + valptr[col] = std::distance(vec.begin(), index_iter); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + } + } + } + else + { + top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } - } else { - top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } - } - const float* ptr = bottom_blobs[0]; - std::vector vec; - vec.resize(bottom_blobs[0].c); - float* valptr = top_val_blob; - float* indptr; - if (top_blobs.size() == 2) indptr = top_ind_blob; - - for (int r = 0; r < bottom_blobs[0].h; r++) { - for (int col = 0; col < bottom_blobs[0].w; col++) { - for (int i = 0; i < bottom_blobs[0].c; i++) { - ptr = bottom_blobs[0].channel(i); - vec[i] = ptr[r * bottom_blobs[0].w + col]; - } - if (largest == 1) { - auto index_iter = std::max_element(vec.begin(), vec.end()); - valptr[r * top_val_blob.w + col] = *index_iter; - if (top_blobs.size() == 2) - indptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); - else - valptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); - - } else if (largest == 0) { - auto index_iter = std::min_element(vec.begin(), vec.end()); - valptr[r * top_val_blob.w + col] = *index_iter; - - if (top_blobs.size() == 2) - indptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); - else - valptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - } - } - } - if (dims == 3 && positive_axis == 1) { - if (keep_dims == 1) { - top_val_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } + std::vector vec; + vec.resize(bottom_blobs[0].h); + float* valptr = top_val_blob; + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob; - std::vector vec; - vec.resize(bottom_blobs[0].h); - - for (int page = 0; page < bottom_blobs[0].c; page++) { - const float* ptr 
= bottom_blobs[0].channel(page);
-        float* valptr = top_val_blob.channel(page);
-        float* indptr;
-        if (top_blobs.size() == 2) indptr = top_ind_blob.channel(page);
-        for (int col = 0; col < bottom_blobs[0].w; col++) {
-          for (int i = 0; i < bottom_blobs[0].h; i++) {
-            vec[i] = ptr[i * bottom_blobs[0].w + col];
-          }
-          if (largest == 1) {
-            auto index_iter = std::max_element(vec.begin(), vec.end());
-            valptr[col] = *index_iter;
-            if (top_blobs.size() == 2)
-              indptr[col] = std::distance(vec.begin(), index_iter);
-            else
-              valptr[col] = std::distance(vec.begin(), index_iter);
-          } else if (largest == 0) {
-            auto index_iter = std::min_element(vec.begin(), vec.end());
-            valptr[col] = *index_iter;
-            if (top_blobs.size() == 2)
-              indptr[col] = std::distance(vec.begin(), index_iter);
-            else
-              valptr[col] = std::distance(vec.begin(), index_iter);
-          } else {
-            fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
-            return -100;
-          }
-        }
-      }
-    } else {
-      top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].c, 4u, opt.blob_allocator);
-      if (top_val_blob.empty()) return -100;
-      if (top_blobs.size() == 2) {
-        top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].c, 4u, opt.blob_allocator);
-        if (top_ind_blob.empty()) return -100;
-      }
-
-      std::vector<float> vec;
-      vec.resize(bottom_blobs[0].h);
-      float* valptr = top_val_blob;
-      float* indptr;
-      if (top_blobs.size() == 2) indptr = top_ind_blob;
-
-      for (int page = 0; page < bottom_blobs[0].c; page++) {
-        const float* ptr = bottom_blobs[0].channel(page);
-        for (int col = 0; col < bottom_blobs[0].w; col++) {
-          for (int i = 0; i < bottom_blobs[0].h; i++) {
-            vec[i] = ptr[i * bottom_blobs[0].w + col];
-          }
-          if (largest == 1) {
-            auto index_iter = std::max_element(vec.begin(), vec.end());
-            valptr[page * top_val_blob.w + col] = *index_iter;
-            if (top_blobs.size() == 2)
-              indptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
-            else
-              valptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
-          } else if (largest == 0) {
-            auto index_iter = std::min_element(vec.begin(), vec.end());
-            valptr[page * top_val_blob.w + col] = *index_iter;
-            if (top_blobs.size() == 2)
-              indptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
-            else
-              valptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
-          } else {
-            fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
-            return -100;
-          }
-        }
-      }
-    }
-  }
+            for (int page = 0; page < bottom_blobs[0].c; page++)
+            {
+                const float* ptr = bottom_blobs[0].channel(page);
+                for (int col = 0; col < bottom_blobs[0].w; col++)
+                {
+                    for (int i = 0; i < bottom_blobs[0].h; i++)
+                    {
+                        vec[i] = ptr[i * bottom_blobs[0].w + col];
+                    }
+                    if (largest == 1)
+                    {
+                        auto index_iter = std::max_element(vec.begin(), vec.end());
+                        valptr[page * top_val_blob.w + col] = *index_iter;
+                        if (top_blobs.size() == 2)
+                            indptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
+                        else
+                            valptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
+                    }
+                    else if (largest == 0)
+                    {
+                        auto index_iter = std::min_element(vec.begin(), vec.end());
+                        valptr[page * top_val_blob.w + col] = *index_iter;
+                        if (top_blobs.size() == 2)
+                            indptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
+                        else
+                            valptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
+                    }
+                    else
+                    {
+                        fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
+                        return -100;
+                    }
+                }
+            }
+        }
+    }
-  if (dims == 3 && positive_axis == 2) {
-    if (keep_dims == 1) {
-      top_val_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
-      if (top_val_blob.empty()) return -100;
-      if (top_blobs.size() == 2) {
-        top_ind_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
-        if (top_ind_blob.empty()) return -100;
-      }
-
-      std::vector<float> vec;
-      vec.resize(bottom_blobs[0].w);
-
-      for (int page = 0; page < bottom_blobs[0].c; page++) {
-        const float* ptr = bottom_blobs[0].channel(page);
-        float* valptr = top_val_blob.channel(page);
-        float* indptr;
-        if (top_blobs.size() == 2) indptr = top_ind_blob.channel(page);
-        for (int r = 0; r < bottom_blobs[0].h; r++) {
-          for (int i = 0; i < bottom_blobs[0].w; i++) {
-            vec[i] = ptr[r * bottom_blobs[0].w + i];
-          }
-          if (largest == 1) {
-            auto index_iter = std::max_element(vec.begin(), vec.end());
-            valptr[r] = *index_iter;
-            if (top_blobs.size() == 2)
-              indptr[r] = std::distance(vec.begin(), index_iter);
-            else
-              valptr[r] = std::distance(vec.begin(), index_iter);
-          } else if (largest == 0) {
-            auto index_iter = std::min_element(vec.begin(), vec.end());
-            valptr[r] = *index_iter;
-            if (top_blobs.size() == 2)
-              indptr[r] = std::distance(vec.begin(), index_iter);
-            else
-              valptr[r] = std::distance(vec.begin(), index_iter);
-          } else {
-            fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
-            return -100;
-          }
-        }
-      }
-    } else {
-      top_val_blob.create(bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
-      if (top_val_blob.empty()) return -100;
-      if (top_blobs.size() == 2) {
-        top_ind_blob.create(bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
-        if (top_ind_blob.empty()) return -100;
-      }
-
-      std::vector<float> vec;
-      vec.resize(bottom_blobs[0].w);
-      float* valptr = top_val_blob;
-      float* indptr;
-      if (top_blobs.size() == 2) indptr = top_ind_blob;
-
-      for (int page = 0; page < bottom_blobs[0].c; page++) {
-        const float* ptr = bottom_blobs[0].channel(page);
-        for (int r = 0; r < bottom_blobs[0].h; r++) {
-          for (int i = 0; i < bottom_blobs[0].w; i++) {
-            vec[i] = ptr[r * bottom_blobs[0].w + i];
-          }
-          if (largest == 1) {
-            auto index_iter = std::max_element(vec.begin(), vec.end());
-            valptr[page * top_val_blob.w + r] = *index_iter;
-            if (top_blobs.size() == 2)
-              indptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter);
-            else
-              valptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter);
-          } else if (largest == 0) {
-            auto index_iter = std::min_element(vec.begin(), vec.end());
-            valptr[page * top_val_blob.w + r] = *index_iter;
-            if (top_blobs.size() == 2)
-              indptr[page * top_val_blob.w + r] = std::distance(vec.begin(), index_iter);
-            else
-              valptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter);
-          } else {
-            fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
-            return -100;
-          }
-        }
-      }
-    }
-  }
-  return 0;
-}
+    if (dims == 3 && positive_axis == 2)
+    {
+        if (keep_dims == 1)
+        {
+            top_val_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
+            if (top_val_blob.empty()) return -100;
+            if (top_blobs.size() == 2)
+            {
+                top_ind_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
+                if (top_ind_blob.empty()) return -100;
+            }
+
+            std::vector<float> vec;
+            vec.resize(bottom_blobs[0].w);
+
+            for (int page = 0; page < bottom_blobs[0].c; page++)
+            {
+                const float* ptr = bottom_blobs[0].channel(page);
+                float* valptr = top_val_blob.channel(page);
+                float* indptr;
+                if (top_blobs.size() == 2) indptr = top_ind_blob.channel(page);
+                for (int r = 0; r < bottom_blobs[0].h; r++)
+                {
+                    for (int i = 0; i < bottom_blobs[0].w; i++)
+                    {
+                        vec[i] = ptr[r * bottom_blobs[0].w + i];
+                    }
+                    if (largest == 1)
+                    {
+                        auto index_iter = std::max_element(vec.begin(), vec.end());
+                        valptr[r] = *index_iter;
+                        if (top_blobs.size() == 2)
+                            indptr[r] = std::distance(vec.begin(), index_iter);
+                        else
+                            valptr[r] = std::distance(vec.begin(), index_iter);
+                    }
+                    else if (largest == 0)
+                    {
+                        auto index_iter = std::min_element(vec.begin(), vec.end());
+                        valptr[r] = *index_iter;
+                        if (top_blobs.size() == 2)
+                            indptr[r] = std::distance(vec.begin(), index_iter);
+                        else
+                            valptr[r] = std::distance(vec.begin(), index_iter);
+                    }
+                    else
+                    {
+                        fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
+                        return -100;
+                    }
+                }
+            }
+        }
+        else
+        {
+            top_val_blob.create(bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
+            if (top_val_blob.empty()) return -100;
+            if (top_blobs.size() == 2)
+            {
+                top_ind_blob.create(bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
+                if (top_ind_blob.empty()) return -100;
+            }
+
+            std::vector<float> vec;
+            vec.resize(bottom_blobs[0].w);
+            float* valptr = top_val_blob;
+            float* indptr;
+            if (top_blobs.size() == 2) indptr = top_ind_blob;
+
+            for (int page = 0; page < bottom_blobs[0].c; page++)
+            {
+                const float* ptr = bottom_blobs[0].channel(page);
+                for (int r = 0; r < bottom_blobs[0].h; r++)
+                {
+                    for (int i = 0; i < bottom_blobs[0].w; i++)
+                    {
+                        vec[i] = ptr[r * bottom_blobs[0].w + i];
+                    }
+                    if (largest == 1)
+                    {
+                        auto index_iter = std::max_element(vec.begin(), vec.end());
+                        valptr[page * top_val_blob.w + r] = *index_iter;
+                        if (top_blobs.size() == 2)
+                            indptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter);
+                        else
+                            valptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter);
+                    }
+                    else if (largest == 0)
+                    {
+                        auto index_iter = std::min_element(vec.begin(), vec.end());
+                        valptr[page * top_val_blob.w + r] = *index_iter;
+                        if (top_blobs.size() == 2)
+                            indptr[page * top_val_blob.w + r] = std::distance(vec.begin(), index_iter);
+                        else
+                            valptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter);
+                    }
+                    else
+                    {
+                        fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
+                        return -100;
+                    }
+                }
+            }
+        }
+    }
+    return 0;
+}

 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.h b/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.h
index d390fbafcd..e9bbde1297 100644
--- a/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.h
+++ b/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.h
@@ -4,21 +4,22 @@

 #include "layer.h"

-namespace mmdeploy {
+namespace mmdeploy
+{

-class TopK : public ncnn::Layer {
- public:
-  TopK();
-  virtual int load_param(const ncnn::ParamDict& pd);
-  virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs, std::vector<ncnn::Mat>& top_blobs,
-                      const ncnn::Option& opt) const;
+    class TopK : public ncnn::Layer
+    {
+      public:
+        TopK();
+        virtual int load_param(const ncnn::ParamDict& pd);
+        virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs, std::vector<ncnn::Mat>& top_blobs, const ncnn::Option& opt) const;

- public:
-  int axis;
-  int largest;
-  int sorted;
-  int keep_dims;
-};
+      public:
+        int axis;
+        int largest;
+        int sorted;
+        int keep_dims;
+    };

 }  // namespace mmdeploy
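[editor's note] The TopK forward pass above reduces along a non-contiguous axis by gathering one column into a scratch vector and then taking max_element/distance. A minimal standalone sketch of that pattern, for readers skimming the reformatted diff (illustrative only, not part of the patch; all names are local to the example):

// argmax along columns of a row-major h x w matrix, TopK-style
#include <algorithm>
#include <cstdio>
#include <iterator>
#include <vector>

int main()
{
    const int h = 3, w = 4;  // rows x cols, row-major
    const float data[h * w] = {0, 9, 2, 3,
                               4, 5, 6, 7,
                               8, 1, 10, 11};
    std::vector<float> vec(h);
    for (int col = 0; col < w; col++)
    {
        for (int i = 0; i < h; i++) vec[i] = data[i * w + col];  // gather column
        auto it = std::max_element(vec.begin(), vec.end());
        std::printf("col %d: max=%g argmax=%td\n", col, *it, std::distance(vec.begin(), it));
    }
    return 0;
}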
diff --git a/csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/ncnn_ext.cpp b/csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/ncnn_ext.cpp
old mode 100755
new mode 100644
index ac158b9edb..1c8ad70cc7
--- a/csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/ncnn_ext.cpp
+++ b/csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/ncnn_ext.cpp
@@ -4,9 +4,11 @@
 #include "ncnn_ops_register.h"
 #include "net.h"

-PYBIND11_MODULE(ncnn_ext, m) {
-  m.def(
-      "register_mmdeploy_custom_layers",
-      [](ncnn::Net &net) { return register_mmdeploy_custom_layers(net); },
-      "register mmdeploy custom ncnn layers.");
+PYBIND11_MODULE(ncnn_ext, m)
+{
+    m.def(
+        "register_mmdeploy_custom_layers",
+        [](ncnn::Net& net)
+        { return register_mmdeploy_custom_layers(net); },
+        "register mmdeploy custom ncnn layers.");
 }
diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/common/onnxruntime_register.h b/csrc/mmdeploy/backend_ops/onnxruntime/common/onnxruntime_register.h
index 28d2a2b782..1095c28bae 100644
--- a/csrc/mmdeploy/backend_ops/onnxruntime/common/onnxruntime_register.h
+++ b/csrc/mmdeploy/backend_ops/onnxruntime/common/onnxruntime_register.h
@@ -6,11 +6,12 @@
 #include "mmdeploy/core/macro.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

-MMDEPLOY_API OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
-                                                       const OrtApiBase *api);
+    MMDEPLOY_API OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options,
+                                                           const OrtApiBase* api);

 #ifdef __cplusplus
 }
diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.cpp
index c604e4b650..da959ec37e 100644
--- a/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.cpp
+++ b/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.cpp
@@ -1,10 +1,12 @@
 // Copyright (c) OpenMMLab. All rights reserved.
 #include "ort_utils.h"

-namespace mmdeploy {
+namespace mmdeploy
+{

-CustomOpsTable& get_mmdeploy_custom_ops() {
-  static CustomOpsTable _custom_ops;
-  return _custom_ops;
-}
+    CustomOpsTable& get_mmdeploy_custom_ops()
+    {
+        static CustomOpsTable _custom_ops;
+        return _custom_ops;
+    }

 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.h b/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.h
index e19c984f86..14d2da3457 100644
--- a/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.h
+++ b/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.h
@@ -6,32 +6,39 @@
 #include <unordered_map>
 #include <vector>

-namespace mmdeploy {
-
-typedef std::unordered_map<std::string, std::vector<const OrtCustomOp*>> CustomOpsTable;
-
-struct OrtTensorDimensions : std::vector<int64_t> {
-  OrtTensorDimensions(Ort::CustomOpApi ort, const OrtValue* value) {
-    OrtTensorTypeAndShapeInfo* info = ort.GetTensorTypeAndShape(value);
-    std::vector<int64_t>::operator=(ort.GetTensorShape(info));
-    ort.ReleaseTensorTypeAndShapeInfo(info);
-  }
-};
-
-CustomOpsTable& get_mmdeploy_custom_ops();
-
-template <char const* domain, typename T>
-class OrtOpsRegistry {
- public:
-  OrtOpsRegistry() { get_mmdeploy_custom_ops()[domain].push_back(&instance); }
-
- private:
-  T instance{};
-};
-
-#define REGISTER_ONNXRUNTIME_OPS(domain, name)      \
-  static char __domain_##domain##name[] = #domain; \
-  static OrtOpsRegistry<__domain_##domain##name, name> ort_ops_registry_##domain##name {}
+namespace mmdeploy
+{
+
+    typedef std::unordered_map<std::string, std::vector<const OrtCustomOp*>> CustomOpsTable;
+
+    struct OrtTensorDimensions : std::vector<int64_t>
+    {
+        OrtTensorDimensions(Ort::CustomOpApi ort, const OrtValue* value)
+        {
+            OrtTensorTypeAndShapeInfo* info = ort.GetTensorTypeAndShape(value);
+            std::vector<int64_t>::operator=(ort.GetTensorShape(info));
+            ort.ReleaseTensorTypeAndShapeInfo(info);
+        }
+    };
+
+    CustomOpsTable& get_mmdeploy_custom_ops();
+
+    template <char const* domain, typename T>
+    class OrtOpsRegistry
+    {
+      public:
+        OrtOpsRegistry()
+        {
+            get_mmdeploy_custom_ops()[domain].push_back(&instance);
+        }
+
+      private:
+        T instance{};
+    };
+
+#define REGISTER_ONNXRUNTIME_OPS(domain, name)        \
+    static char __domain_##domain##name[] = #domain;  \
+    static OrtOpsRegistry<__domain_##domain##name, name> ort_ops_registry_##domain##name {}

 }  // namespace mmdeploy

 #endif  // ORT_MMCV_UTILS_H
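[editor's note] REGISTER_ONNXRUNTIME_OPS above relies on static self-registration: a file-scope object whose constructor runs at load time and appends the op to a per-domain table. A simplified sketch of that idiom under C++17 (illustrative only, not part of the patch; FakeOp, ops_table, and kDomain are hypothetical stand-ins for the real OrtCustomOp types):

#include <string>
#include <unordered_map>
#include <vector>

struct FakeOp { const char* name = "grid_sampler"; };

using OpsTable = std::unordered_map<std::string, std::vector<const FakeOp*>>;

OpsTable& ops_table()
{
    static OpsTable table;  // constructed on first use, shared by all registries
    return table;
}

template <char const* Domain, typename T>
struct OpsRegistry
{
    OpsRegistry() { ops_table()[Domain].push_back(&instance); }  // runs before main()
    T instance{};
};

static char kDomain[] = "mmdeploy";
static OpsRegistry<kDomain, FakeOp> registry;  // registers the op at load time

The payoff of this design is that each op's translation unit registers itself; RegisterCustomOps only has to walk the table, and no central list needs editing when a new op is added.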
diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.cpp
index c7fed37d23..27eb677394 100644
--- a/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.cpp
+++ b/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.cpp
@@ -8,287 +8,335 @@
 #include "ort_utils.h"

-namespace mmdeploy {
+namespace mmdeploy
+{

 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
 #define MAX(a, b) (((a) < (b)) ? (b) : (a))
 #define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit - 1), MAX(in, 0))

-GridSampleKernel::GridSampleKernel(const OrtApi &api, const OrtKernelInfo *info)
-    : ort_(api), info_(info) {
-  align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
-  interpolation_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "interpolation_mode");
-  padding_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "padding_mode");
-
-  allocator_ = Ort::AllocatorWithDefaultOptions();
-}
-
-enum GridSamplerInterpolation { Bilinear = 0, Nearest = 1, Bicubic = 2 };
-enum GridSamplerPadding { Zeros = 0, Border = 1, Reflection = 2 };
-
-template <typename scalar_t>
-static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size, bool align_corners) {
-  if (align_corners) {
-    return ((coord + 1) / 2) * (size - 1);
-  } else {
-    return ((coord + 1) * size - 1) / 2;
-  }
-}
-
-// Clips coordinates to between 0 and clip_limit - 1
-template <typename scalar_t>
-static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) {
-  return std::min(static_cast<scalar_t>(clip_limit - 1), std::max(in, static_cast<scalar_t>(0)));
-}
-
-// Reflects coordinates until they fall between low and high (inclusive).
-// The bounds are passed as twice their value so that half-integer values
-// can be represented as ints.
-template <typename scalar_t>
-static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low, int64_t twice_high) {
-  if (twice_low == twice_high) {
-    return static_cast<scalar_t>(0);
-  }
-  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
-  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
-  in = std::fabs(in - min);
-  // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
-  scalar_t extra = std::fmod(in, span);
-  int flips = static_cast<int>(std::floor(in / span));
-  if (flips % 2 == 0) {
-    return extra + min;
-  } else {
-    return span - extra + min;
-  }
-}
-
-template <typename scalar_t>
-static inline scalar_t compute_coordinates(scalar_t coord, int64_t size, int64_t padding_mode,
-                                           bool align_corners) {
-  if (padding_mode == GridSamplerPadding::Border) {
-    coord = clip_coordinates(coord, size);
-  } else if (padding_mode == GridSamplerPadding::Reflection) {
-    if (align_corners) {
-      coord = reflect_coordinates(coord, 0, 2 * (size - 1));
-    } else {
-      coord = reflect_coordinates(coord, -1, 2 * size - 1);
-    }
-    coord = clip_coordinates(coord, size);
-  }
-  return coord;
-}
-
-// Computes the pixel source index value for a grid coordinate
-template <typename scalar_t>
-static inline scalar_t grid_sampler_compute_source_index(scalar_t coord, int64_t size,
-                                                         int64_t padding_mode, bool align_corners) {
-  coord = grid_sampler_unnormalize(coord, size, align_corners);
-  coord = compute_coordinates(coord, size, padding_mode, align_corners);
-  return coord;
-}
-
-static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) {
-  return h >= 0 && h < H && w >= 0 && w < W;
-}
-
-template <typename scalar_t>
-static inline scalar_t get_value_bounded(const scalar_t *data, scalar_t x, scalar_t y, int64_t W,
-                                         int64_t H, int64_t sW, int64_t sH, int64_t padding_mode,
-                                         bool align_corners) {
-  x = compute_coordinates(x, W, padding_mode, align_corners);
-  y = compute_coordinates(y, H, padding_mode, align_corners);
-
-  int64_t ix = static_cast<int64_t>(x);
-  int64_t iy = static_cast<int64_t>(y);
-
-  if (within_bounds_2d(iy, ix, H, W)) {
-    return data[iy * sH + ix * sW];
-  }
-  return static_cast<scalar_t>(0);
-}
-
-template <typename scalar_t>
-static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
-  return ((A + 2) * x - (A + 3)) * x * x + 1;
-}
-
-template <typename scalar_t>
-static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
-  return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
-}
-
-template <typename scalar_t>
-static inline void get_cubic_upsample_coefficients(scalar_t coeffs[4], scalar_t t) {
-  scalar_t A = -0.75;
-
-  scalar_t x1 = t;
-  coeffs[0] = cubic_convolution2(x1 + 1.0, A);
-  coeffs[1] = cubic_convolution1(x1, A);
-
-  // opposite coefficients
-  scalar_t x2 = 1.0 - t;
-  coeffs[2] = cubic_convolution1(x2, A);
-  coeffs[3] = cubic_convolution2(x2 + 1.0, A);
-}
-
-template <typename scalar_t>
-static inline scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2, scalar_t x3,
-                                      scalar_t t) {
-  scalar_t coeffs[4];
-  get_cubic_upsample_coefficients(coeffs, t);
-
-  return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
-}
-
-void GridSampleKernel::Compute(OrtKernelContext *context) {
-  const bool align_corners = align_corners_;
-  const int64_t padding_mode = padding_mode_;
-  const int64_t interpolation_mode = interpolation_mode_;
-
-  const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
-  const float *input_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
-
-  const OrtValue *grid = ort_.KernelContext_GetInput(context, 1);
-  const float *grid_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(grid));
-
-  OrtTensorDimensions input_dims(ort_, input);
-  OrtTensorDimensions grid_dims(ort_, grid);
-  int64_t N = input_dims[0];
-  int64_t C = input_dims[1];
-  int64_t inp_H = input_dims[2];
-  int64_t inp_W = input_dims[3];
-  int64_t out_H = grid_dims[1];
-  int64_t out_W = grid_dims[2];
-
-  std::vector<int64_t> output_dims = {N, C, out_H, out_W};
-  OrtValue *output =
-      ort_.KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size());
-  float *out_ptr = ort_.GetTensorMutableData<float>(output);
-
-  int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3];
-  int64_t inp_sC = input_dims[2] * input_dims[3];
-  int64_t inp_sH = input_dims[3];
-  int64_t inp_sW = 1;
-  int64_t grid_sN = grid_dims[1] * grid_dims[2] * grid_dims[3];
-  int64_t grid_sH = grid_dims[2] * grid_dims[3];
-  int64_t grid_sW = grid_dims[3];
-  int64_t grid_sCoor = 1;
-  int64_t out_sN = output_dims[1] * output_dims[2] * output_dims[3];
-  int64_t out_sC = output_dims[2] * output_dims[3];
-  int64_t out_sH = output_dims[3];
-  int64_t out_sW = 1;
-
-  // loop over each output pixel
-  for (int64_t n = 0; n < N; ++n) {
-    const float *grid_ptr_N = grid_data + n * grid_sN;
-    const float *inp_ptr_N = input_data + n * inp_sN;
-    for (int64_t h = 0; h < out_H; ++h) {
-      for (int64_t w = 0; w < out_W; ++w) {
-        const float *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
-        float x = *grid_ptr_NHW;
-        float y = grid_ptr_NHW[grid_sCoor];
-
-        float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode, align_corners);
-        float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode, align_corners);
-
-        if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
-          // get corner pixel values from (x, y)
-          // for 4d, we use north-east-south-west
-          int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
-          int64_t iy_nw = static_cast<int64_t>(std::floor(iy));
-
-          int64_t ix_ne = ix_nw + 1;
-          int64_t iy_ne = iy_nw;
-
-          int64_t ix_sw = ix_nw;
-          int64_t iy_sw = iy_nw + 1;
-
-          int64_t ix_se = ix_nw + 1;
-          int64_t iy_se = iy_nw + 1;
-
-          // get surfaces to each neighbor:
-          float nw = (ix_se - ix) * (iy_se - iy);
-          float ne = (ix - ix_sw) * (iy_sw - iy);
-          float sw = (ix_ne - ix) * (iy - iy_ne);
-          float se = (ix - ix_nw) * (iy - iy_nw);
-
-          // calculate bilinear weighted pixel value and set output pixel
-          const float *inp_ptr_NC = inp_ptr_N;
-          float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
-          for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
-            auto res = static_cast<float>(0);
-            if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
-              res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
-            }
-            if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
-              res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
-            }
-            if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
-              res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
-            }
-            if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
-              res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
-            }
-            *out_ptr_NCHW = res;
-          }
-        } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
-          int64_t ix_nearest = static_cast<int64_t>(std::nearbyint(ix));
-          int64_t iy_nearest = static_cast<int64_t>(std::nearbyint(iy));
-
-          // assign nearest neighbor pixel value to output pixel
-          float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
-          const float *inp_ptr_NC = inp_ptr_N;
-          for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
-            if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
-              *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
-            } else {
-              *out_ptr_NCHW = static_cast<float>(0);
-            }
-          }
-        } else if (interpolation_mode == GridSamplerInterpolation::Bicubic) {
-          // grid_sampler_compute_source_index will "clip the value" of idx
-          // depends on the padding,
-          // which would cause calculation to be wrong,
-          // for example x = -0.1 -> ix = 0 for zero padding, but in bicubic ix
-          // = floor(x) = -1
-          // There would be more problem in reflection padding, since the -1 and
-          // +1 direction is not fixed in boundary condition
-          ix = grid_sampler_unnormalize(x, inp_W, align_corners);
-          iy = grid_sampler_unnormalize(y, inp_H, align_corners);
-
-          float ix_nw = std::floor(ix);
-          float iy_nw = std::floor(iy);
-
-          const float tx = ix - ix_nw;
-          const float ty = iy - iy_nw;
-
-          const float *inp_ptr_NC = inp_ptr_N;
-          float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
-          for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
-            float coefficients[4];
-
-            // Interpolate 4 values in the x direction
-            for (int64_t i = 0; i < 4; ++i) {
-              coefficients[i] = cubic_interp1d(
-                  get_value_bounded<float>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H,
-                                           inp_sW, inp_sH, padding_mode, align_corners),
-                  get_value_bounded<float>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H,
-                                           inp_sW, inp_sH, padding_mode, align_corners),
-                  get_value_bounded<float>(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i, inp_W, inp_H,
-                                           inp_sW, inp_sH, padding_mode, align_corners),
-                  get_value_bounded<float>(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i, inp_W, inp_H,
-                                           inp_sW, inp_sH, padding_mode, align_corners),
-                  tx);
-            }
-
-            // Interpolate in the y direction
-            *out_ptr_NCHW = cubic_interp1d(coefficients[0], coefficients[1], coefficients[2],
-                                           coefficients[3], ty);
-          }
-        }
-      }
-    }
-  }
-}
-
-REGISTER_ONNXRUNTIME_OPS(mmdeploy, GridSampleOp);
+    GridSampleKernel::GridSampleKernel(const OrtApi& api, const OrtKernelInfo* info)
+        : ort_(api)
+        , info_(info)
+    {
+        align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
+        interpolation_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "interpolation_mode");
+        padding_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "padding_mode");
+
+        allocator_ = Ort::AllocatorWithDefaultOptions();
+    }
+
+    enum GridSamplerInterpolation
+    {
+        Bilinear = 0,
+        Nearest = 1,
+        Bicubic = 2
+    };
+    enum GridSamplerPadding
+    {
+        Zeros = 0,
+        Border = 1,
+        Reflection = 2
+    };
+
+    template <typename scalar_t>
+    static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size, bool align_corners)
+    {
+        if (align_corners)
+        {
+            return ((coord + 1) / 2) * (size - 1);
+        }
+        else
+        {
+            return ((coord + 1) * size - 1) / 2;
+        }
+    }
+
+    // Clips coordinates to between 0 and clip_limit - 1
+    template <typename scalar_t>
+    static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit)
+    {
+        return std::min(static_cast<scalar_t>(clip_limit - 1), std::max(in, static_cast<scalar_t>(0)));
+    }
+
+    // Reflects coordinates until they fall between low and high (inclusive).
+    // The bounds are passed as twice their value so that half-integer values
+    // can be represented as ints.
+    template <typename scalar_t>
+    static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low, int64_t twice_high)
+    {
+        if (twice_low == twice_high)
+        {
+            return static_cast<scalar_t>(0);
+        }
+        scalar_t min = static_cast<scalar_t>(twice_low) / 2;
+        scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
+        in = std::fabs(in - min);
+        // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
+        scalar_t extra = std::fmod(in, span);
+        int flips = static_cast<int>(std::floor(in / span));
+        if (flips % 2 == 0)
+        {
+            return extra + min;
+        }
+        else
+        {
+            return span - extra + min;
+        }
+    }
+
+    template <typename scalar_t>
+    static inline scalar_t compute_coordinates(scalar_t coord, int64_t size, int64_t padding_mode, bool align_corners)
+    {
+        if (padding_mode == GridSamplerPadding::Border)
+        {
+            coord = clip_coordinates(coord, size);
+        }
+        else if (padding_mode == GridSamplerPadding::Reflection)
+        {
+            if (align_corners)
+            {
+                coord = reflect_coordinates(coord, 0, 2 * (size - 1));
+            }
+            else
+            {
+                coord = reflect_coordinates(coord, -1, 2 * size - 1);
+            }
+            coord = clip_coordinates(coord, size);
+        }
+        return coord;
+    }
+
+    // Computes the pixel source index value for a grid coordinate
+    template <typename scalar_t>
+    static inline scalar_t grid_sampler_compute_source_index(scalar_t coord, int64_t size, int64_t padding_mode, bool align_corners)
+    {
+        coord = grid_sampler_unnormalize(coord, size, align_corners);
+        coord = compute_coordinates(coord, size, padding_mode, align_corners);
+        return coord;
+    }
+
+    static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W)
+    {
+        return h >= 0 && h < H && w >= 0 && w < W;
+    }
+
+    template <typename scalar_t>
+    static inline scalar_t get_value_bounded(const scalar_t* data, scalar_t x, scalar_t y, int64_t W, int64_t H, int64_t sW, int64_t sH, int64_t padding_mode, bool align_corners)
+    {
+        x = compute_coordinates(x, W, padding_mode, align_corners);
+        y = compute_coordinates(y, H, padding_mode, align_corners);
+
+        int64_t ix = static_cast<int64_t>(x);
+        int64_t iy = static_cast<int64_t>(y);
+
+        if (within_bounds_2d(iy, ix, H, W))
+        {
+            return data[iy * sH + ix * sW];
+        }
+        return static_cast<scalar_t>(0);
+    }
+
+    template <typename scalar_t>
+    static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A)
+    {
+        return ((A + 2) * x - (A + 3)) * x * x + 1;
+    }
+
+    template <typename scalar_t>
+    static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A)
+    {
+        return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+    }
+
+    template <typename scalar_t>
+    static inline void get_cubic_upsample_coefficients(scalar_t coeffs[4], scalar_t t)
+    {
+        scalar_t A = -0.75;
+
+        scalar_t x1 = t;
+        coeffs[0] = cubic_convolution2(x1 + 1.0, A);
+        coeffs[1] = cubic_convolution1(x1, A);
+
+        // opposite coefficients
+        scalar_t x2 = 1.0 - t;
+        coeffs[2] = cubic_convolution1(x2, A);
+        coeffs[3] = cubic_convolution2(x2 + 1.0, A);
+    }
+
+    template <typename scalar_t>
+    static inline scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2, scalar_t x3, scalar_t t)
+    {
+        scalar_t coeffs[4];
+        get_cubic_upsample_coefficients(coeffs, t);
+
+        return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+    }
+
+    void GridSampleKernel::Compute(OrtKernelContext* context)
+    {
+        const bool align_corners = align_corners_;
+        const int64_t padding_mode = padding_mode_;
+        const int64_t interpolation_mode = interpolation_mode_;
+
+        const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
+        const float* input_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(input));
+
+        const OrtValue* grid = ort_.KernelContext_GetInput(context, 1);
+        const float* grid_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(grid));
+
+        OrtTensorDimensions input_dims(ort_, input);
+        OrtTensorDimensions grid_dims(ort_, grid);
+        int64_t N = input_dims[0];
+        int64_t C = input_dims[1];
+        int64_t inp_H = input_dims[2];
+        int64_t inp_W = input_dims[3];
+        int64_t out_H = grid_dims[1];
+        int64_t out_W = grid_dims[2];
+
+        std::vector<int64_t> output_dims = {N, C, out_H, out_W};
+        OrtValue* output =
+            ort_.KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size());
+        float* out_ptr = ort_.GetTensorMutableData<float>(output);
+
+        int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3];
+        int64_t inp_sC = input_dims[2] * input_dims[3];
+        int64_t inp_sH = input_dims[3];
+        int64_t inp_sW = 1;
+        int64_t grid_sN = grid_dims[1] * grid_dims[2] * grid_dims[3];
+        int64_t grid_sH = grid_dims[2] * grid_dims[3];
+        int64_t grid_sW = grid_dims[3];
+        int64_t grid_sCoor = 1;
+        int64_t out_sN = output_dims[1] * output_dims[2] * output_dims[3];
+        int64_t out_sC = output_dims[2] * output_dims[3];
+        int64_t out_sH = output_dims[3];
+        int64_t out_sW = 1;
+
+        // loop over each output pixel
+        for (int64_t n = 0; n < N; ++n)
+        {
+            const float* grid_ptr_N = grid_data + n * grid_sN;
+            const float* inp_ptr_N = input_data + n * inp_sN;
+            for (int64_t h = 0; h < out_H; ++h)
+            {
+                for (int64_t w = 0; w < out_W; ++w)
+                {
+                    const float* grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
+                    float x = *grid_ptr_NHW;
+                    float y = grid_ptr_NHW[grid_sCoor];
+
+                    float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode, align_corners);
+                    float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode, align_corners);
+
+                    if (interpolation_mode == GridSamplerInterpolation::Bilinear)
+                    {
+                        // get corner pixel values from (x, y)
+                        // for 4d, we use north-east-south-west
+                        int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
+                        int64_t iy_nw = static_cast<int64_t>(std::floor(iy));
+
+                        int64_t ix_ne = ix_nw + 1;
+                        int64_t iy_ne = iy_nw;
+
+                        int64_t ix_sw = ix_nw;
+                        int64_t iy_sw = iy_nw + 1;
+
+                        int64_t ix_se = ix_nw + 1;
+                        int64_t iy_se = iy_nw + 1;
+
+                        // get surfaces to each neighbor:
+                        float nw = (ix_se - ix) * (iy_se - iy);
+                        float ne = (ix - ix_sw) * (iy_sw - iy);
+                        float sw = (ix_ne - ix) * (iy - iy_ne);
+                        float se = (ix - ix_nw) * (iy - iy_nw);
+
+                        // calculate bilinear weighted pixel value and set output pixel
+                        const float* inp_ptr_NC = inp_ptr_N;
+                        float* out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
+                        for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC)
+                        {
+                            auto res = static_cast<float>(0);
+                            if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W))
+                            {
+                                res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
+                            }
+                            if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W))
+                            {
+                                res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
+                            }
+                            if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W))
+                            {
+                                res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
+                            }
+                            if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W))
+                            {
+                                res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
+                            }
+                            *out_ptr_NCHW = res;
+                        }
+                    }
+                    else if (interpolation_mode == GridSamplerInterpolation::Nearest)
+                    {
+                        int64_t ix_nearest = static_cast<int64_t>(std::nearbyint(ix));
+                        int64_t iy_nearest = static_cast<int64_t>(std::nearbyint(iy));
+
+                        // assign nearest neighbor pixel value to output pixel
+                        float* out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
+                        const float* inp_ptr_NC = inp_ptr_N;
+                        for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC)
+                        {
+                            if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W))
+                            {
+                                *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
+                            }
+                            else
+                            {
+                                *out_ptr_NCHW = static_cast<float>(0);
+                            }
+                        }
+                    }
+                    else if (interpolation_mode == GridSamplerInterpolation::Bicubic)
+                    {
+                        // grid_sampler_compute_source_index will "clip the value" of idx
+                        // depends on the padding,
+                        // which would cause calculation to be wrong,
+                        // for example x = -0.1 -> ix = 0 for zero padding, but in bicubic ix
+                        // = floor(x) = -1
+                        // There would be more problem in reflection padding, since the -1 and
+                        // +1 direction is not fixed in boundary condition
+                        ix = grid_sampler_unnormalize(x, inp_W, align_corners);
+                        iy = grid_sampler_unnormalize(y, inp_H, align_corners);
+
+                        float ix_nw = std::floor(ix);
+                        float iy_nw = std::floor(iy);
+
+                        const float tx = ix - ix_nw;
+                        const float ty = iy - iy_nw;
+
+                        const float* inp_ptr_NC = inp_ptr_N;
+                        float* out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
+                        for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC)
+                        {
+                            float coefficients[4];
+
+                            // Interpolate 4 values in the x direction
+                            for (int64_t i = 0; i < 4; ++i)
+                            {
+                                coefficients[i] = cubic_interp1d(
+                                    get_value_bounded<float>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
+                                    get_value_bounded<float>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
+                                    get_value_bounded<float>(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
+                                    get_value_bounded<float>(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
+                                    tx);
+                            }
+
+                            // Interpolate in the y direction
+                            *out_ptr_NCHW = cubic_interp1d(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    REGISTER_ONNXRUNTIME_OPS(mmdeploy, GridSampleOp);

 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.h b/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.h
index 2581b7833e..e6c9fa280f 100644
--- a/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.h
+++ b/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.h
@@ -4,41 +4,59 @@

 #include <onnxruntime_cxx_api.h>

-namespace mmdeploy {
-
-struct GridSampleKernel {
-  GridSampleKernel(const OrtApi &api, const OrtKernelInfo *info);
-
-  void Compute(OrtKernelContext *context);
-
- protected:
-  Ort::CustomOpApi ort_;
-  const OrtKernelInfo *info_;
-  Ort::AllocatorWithDefaultOptions allocator_;
-
-  int64_t align_corners_;
-  int64_t interpolation_mode_;
-  int64_t padding_mode_;
-};
-
-struct GridSampleOp : Ort::CustomOpBase<GridSampleOp, GridSampleKernel> {
-  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const {
-    return new GridSampleKernel(api, info);
-  };
-
-  const char *GetName() const { return "grid_sampler"; };
-
-  size_t GetInputTypeCount() const { return 2; };
-  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
-    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
-  };
-
-  size_t GetOutputTypeCount() const { return 1; };
-  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
-    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
-  };
-
-  const char *GetExecutionProviderType() const { return "CPUExecutionProvider"; };
-};
+namespace mmdeploy
+{
+
+    struct GridSampleKernel
+    {
+        GridSampleKernel(const OrtApi& api, const OrtKernelInfo* info);
+
+        void Compute(OrtKernelContext* context);
+
+      protected:
+        Ort::CustomOpApi ort_;
+        const OrtKernelInfo* info_;
+        Ort::AllocatorWithDefaultOptions allocator_;
+
+        int64_t align_corners_;
+        int64_t interpolation_mode_;
+        int64_t padding_mode_;
+    };
+
+    struct GridSampleOp : Ort::CustomOpBase<GridSampleOp, GridSampleKernel>
+    {
+        void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const
+        {
+            return new GridSampleKernel(api, info);
+        };
+
+        const char* GetName() const
+        {
+            return "grid_sampler";
+        };
+
+        size_t GetInputTypeCount() const
+        {
+            return 2;
+        };
+        ONNXTensorElementDataType GetInputType(size_t /*index*/) const
+        {
+            return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+        };
+
+        size_t GetOutputTypeCount() const
+        {
+            return 1;
+        };
+        ONNXTensorElementDataType GetOutputType(size_t /*index*/) const
+        {
+            return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+        };
+
+        const char* GetExecutionProviderType() const
+        {
+            return "CPUExecutionProvider";
+        };
+    };

 }  // namespace mmdeploy

 #endif
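[editor's note] A numeric sketch of the two coordinate steps the grid-sample kernel above performs per output pixel: unnormalize a grid value from [-1, 1] into pixel space, then split the result into the two bilinear weights along one axis. Standalone and illustrative only; the constants are made up for the example:

#include <cmath>
#include <cstdio>

int main()
{
    const bool align_corners = false;
    const long size = 8;    // input width
    const float x = 0.25f;  // normalized grid coordinate in [-1, 1]

    // same formula as grid_sampler_unnormalize
    float ix = align_corners ? ((x + 1) / 2) * (size - 1)
                             : ((x + 1) * size - 1) / 2;  // -> 4.5

    long ix_nw = static_cast<long>(std::floor(ix));
    float tx = ix - ix_nw;  // fractional part, 0.5 here
    std::printf("ix=%.2f  west weight=%.2f  east weight=%.2f\n", ix, 1 - tx, tx);
    return 0;
}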
diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.cpp
index 075c3277bc..320fa8dd45 100644
--- a/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.cpp
+++ b/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.cpp
@@ -8,191 +8,218 @@
 #include "modulated_deform_conv/modulated_deform_conv_cpu.h"
 #include "ort_utils.h"

-namespace mmdeploy {
-
-void parallel_unroll_gemm(const float *A, const float *B, const float *V, const float *H,
-                          const int32_t M, const int32_t N, const int32_t K, const float alpha,
-                          const float beta, float *Y, const int32_t start_row,
-                          const int32_t end_row) {
-  std::vector<float> tmp(N);
-  for (int32_t m = start_row; m < end_row; ++m) {
-    for (int32_t n = 0; n < N; n++) {
-      tmp[n] = 0;
-    }
-    {
-      int32_t remainder = K % 8;  // unroll
-      for (int32_t k = 0; k < K; k += 8) {
-        for (int32_t n = 0; n < N; n++) {
-          tmp[n] += A[m * K + k] * B[k * N + n];
-          tmp[n] += A[m * K + k + 1] * B[k * N + N + n];
-          tmp[n] += A[m * K + k + 2] * B[k * N + 2 * N + n];
-          tmp[n] += A[m * K + k + 3] * B[k * N + 3 * N + n];
-          tmp[n] += A[m * K + k + 4] * B[k * N + 4 * N + n];
-          tmp[n] += A[m * K + k + 5] * B[k * N + 5 * N + n];
-          tmp[n] += A[m * K + k + 6] * B[k * N + 6 * N + n];
-          tmp[n] += A[m * K + k + 7] * B[k * N + 7 * N + n];
-        }
-      }
-      for (int32_t k = K - remainder; k < K; k++) {
-        for (int32_t n = 0; n < N; n++) {
-          tmp[n] += A[m * K + k] * B[k * N + n];
-        }
-      }
-    }
-    for (int32_t n = 0; n < N; n++) {
-      tmp[n] *= alpha;
-      if (V) tmp[n] += beta * V[n];
-      if (H) tmp[n] += beta * H[m * N + n];
-      Y[m * N + n] = tmp[n];
-    }
-  }
-}
-
-void deformable_conv2d_ref_fp32(const float *src, const float *offset, const float *mask,
-                                const float *filter, const float *bias, const int64_t batch,
-                                const int64_t src_c, const int64_t src_h, const int64_t src_w,
-                                const int64_t dst_c, const int64_t dst_h, const int64_t dst_w,
-                                const int64_t group, const int64_t offset_group,
-                                const int64_t channels, const int64_t num_output,
-                                const int64_t kernel_h, const int64_t kernel_w,
-                                const int64_t stride_h, const int64_t stride_w, const int64_t pad_h,
-                                const int64_t pad_w, const int64_t dilation_h,
-                                const int64_t dilation_w, float *columns, float *dst) {
-  const int64_t ic_per_gp = channels / group;
-  const int64_t oc_per_gp = num_output / group;
-  // Set up for launching threads
-  std::size_t num_threads = std::thread::hardware_concurrency();
-  std::vector<std::thread> threads;
-  threads.reserve(num_threads);
-
-  for (int64_t b = 0; b < batch; ++b) {
-    for (int64_t g = 0; g < group; ++g) {
-      deformable_im2col_2d(
-          src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
-          offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
-          mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w, src_h, src_w, kernel_h,
-          kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, ic_per_gp,
-          offset_group, dst_h, dst_w, mask != nullptr, columns);
-      float *dst_ptr = dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
-      if (bias != nullptr) {
-        const float *bias_ptr = bias + g * oc_per_gp;
-        for (int64_t oc = 0; oc < oc_per_gp; ++oc) {
-          for (int64_t hw = 0; hw < dst_h * dst_w; ++hw) {
-            dst_ptr[oc * dst_h * dst_w + hw] = bias_ptr[oc];
-          }
-        }
-      } else {
-        memset(dst_ptr, 0.0f, sizeof(float) * oc_per_gp * dst_h * dst_w);
-      }
-      if (num_threads > 1) {
-        // Calculate values to pass to threads
-        int32_t n_rows = (oc_per_gp + num_threads - 1) / num_threads;
-        int32_t end_row = 0;
-        for (int32_t i = 0; i < num_threads; i++) {
-          auto start_row = i * n_rows;
-          end_row = start_row + n_rows;
-          if (end_row > oc_per_gp) end_row = oc_per_gp;
-          std::thread t(parallel_unroll_gemm,
-                        filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns, nullptr,
-                        dst_ptr, oc_per_gp, dst_h * dst_w, ic_per_gp * kernel_h * kernel_w, 1.0f,
-                        1.0f, dst_ptr, start_row, end_row);
-          threads.emplace_back(std::move(t));
-        }
-        // Wait for all threads to complete
-        for (auto &t : threads) t.join();
-        threads.clear();
-      } else {  // parallel gemm degrade to serial gemm with start_row=0 and end_row= oc_per_gp
-        parallel_unroll_gemm(filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns,
-                             nullptr, dst_ptr, oc_per_gp, dst_h * dst_w,
-                             ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr, 0, oc_per_gp);
-      }
-    }
-  }
-}
-
-MMCVModulatedDeformConvKernel::MMCVModulatedDeformConvKernel(const OrtApi &api,
-                                                             const OrtKernelInfo *info)
-    : ort_(api), info_(info) {
-  std::vector<int64_t> stride = ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
-  stride_height_ = stride[0];
-  stride_width_ = stride[1];
-  std::vector<int64_t> padding = ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
-  padding_height_ = padding[0];
-  padding_width_ = padding[1];
-  std::vector<int64_t> dilation =
-      ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
-  dilation_height_ = dilation[0];
-  dilation_width_ = dilation[1];
-  deformable_group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
-  group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");
-
-  // create allocator
-  allocator_ = Ort::AllocatorWithDefaultOptions();
-}
-
-void MMCVModulatedDeformConvKernel::Compute(OrtKernelContext *context) {
-  const int64_t stride_height = stride_height_;
-  const int64_t stride_width = stride_width_;
-  const int64_t padding_height = padding_height_;
-  const int64_t padding_width = padding_width_;
-  const int64_t dilation_height = dilation_height_;
-  const int64_t dilation_width = dilation_width_;
-  const int64_t deformable_group = deformable_group_;
-  const int64_t group = group_;
-
-  const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
-  const float *input_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
-
-  const OrtValue *offset = ort_.KernelContext_GetInput(context, 1);
-  const float *offset_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(offset));
-
-  const OrtValue *mask = ort_.KernelContext_GetInput(context, 2);
-  const float *mask_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(mask));
-
-  const OrtValue *filter = ort_.KernelContext_GetInput(context, 3);
-  const float *filter_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(filter));
-
-  const OrtValue *bias = ort_.KernelContext_GetInput(context, 4);
-  const float *bias_data = (bias != nullptr)
-                               ? reinterpret_cast<const float *>(ort_.GetTensorData<float>(bias))
-                               : nullptr;
-  // const float *bias_data = nullptr;
-
-  OrtTensorDimensions input_dims(ort_, input);
-  OrtTensorDimensions filter_dims(ort_, filter);
-
-  int64_t batch = input_dims[0];
-  int64_t channels = input_dims[1];
-  int64_t in_height = input_dims[2];
-  int64_t in_width = input_dims[3];
-  int64_t num_output = filter_dims[0];
-  int64_t kernel_height = filter_dims[2];
-  int64_t kernel_width = filter_dims[3];
-
-  // get output memory
-  int64_t out_height = floor(
-      (in_height + 2 * padding_height - dilation_height * (kernel_height - 1) - 1) / stride_height +
-      1);
-  int64_t out_width = floor(
-      (in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) / stride_width + 1);
-
-  std::vector<int64_t> output_dims = {batch, num_output, out_height, out_width};
-  OrtValue *output =
-      ort_.KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size());
-  float *out_ptr = ort_.GetTensorMutableData<float>(output);
-
-  // allocate tmp memory
-  int64_t column_len = (channels / group) * kernel_height * kernel_width * out_height * out_width;
-  float *columns = (float *)allocator_.Alloc(sizeof(float) * column_len);
-
-  deformable_conv2d_ref_fp32(input_data, offset_data, mask_data, filter_data, bias_data, batch,
-                             channels, in_height, in_width, num_output, out_height, out_width,
-                             group, deformable_group, channels, num_output, kernel_height,
-                             kernel_width, stride_height, stride_width, padding_height,
-                             padding_width, dilation_height, dilation_width, columns, out_ptr);
-
-  allocator_.Free(columns);
-}
-REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVModulatedDeformConvOp);
-REGISTER_ONNXRUNTIME_OPS(mmcv, MMCVModulatedDeformConvOp);
+namespace mmdeploy
+{
+
+    void parallel_unroll_gemm(const float* A, const float* B, const float* V, const float* H, const int32_t M, const int32_t N, const int32_t K, const float alpha, const float beta, float* Y, const int32_t start_row, const int32_t end_row)
+    {
+        std::vector<float> tmp(N);
+        for (int32_t m = start_row; m < end_row; ++m)
+        {
+            for (int32_t n = 0; n < N; n++)
+            {
+                tmp[n] = 0;
+            }
+            {
+                int32_t remainder = K % 8;  // unroll
+                for (int32_t k = 0; k < K; k += 8)
+                {
+                    for (int32_t n = 0; n < N; n++)
+                    {
+                        tmp[n] += A[m * K + k] * B[k * N + n];
+                        tmp[n] += A[m * K + k + 1] * B[k * N + N + n];
+                        tmp[n] += A[m * K + k + 2] * B[k * N + 2 * N + n];
+                        tmp[n] += A[m * K + k + 3] * B[k * N + 3 * N + n];
+                        tmp[n] += A[m * K + k + 4] * B[k * N + 4 * N + n];
+                        tmp[n] += A[m * K + k + 5] * B[k * N + 5 * N + n];
+                        tmp[n] += A[m * K + k + 6] * B[k * N + 6 * N + n];
+                        tmp[n] += A[m * K + k + 7] * B[k * N + 7 * N + n];
+                    }
+                }
+                for (int32_t k = K - remainder; k < K; k++)
+                {
+                    for (int32_t n = 0; n < N; n++)
+                    {
+                        tmp[n] += A[m * K + k] * B[k * N + n];
+                    }
+                }
+            }
+            for (int32_t n = 0; n < N; n++)
+            {
+                tmp[n] *= alpha;
+                if (V) tmp[n] += beta * V[n];
+                if (H) tmp[n] += beta * H[m * N + n];
+                Y[m * N + n] = tmp[n];
+            }
+        }
+    }
+
+    void deformable_conv2d_ref_fp32(const float* src, const float* offset, const float* mask, const float* filter, const float* bias, const int64_t batch, const int64_t src_c, const int64_t src_h, const int64_t src_w, const int64_t dst_c, const int64_t dst_h, const int64_t dst_w, const int64_t group, const int64_t offset_group, const int64_t channels, const int64_t num_output, const int64_t kernel_h, const int64_t kernel_w, const int64_t stride_h, const int64_t stride_w, const int64_t pad_h, const int64_t pad_w, const int64_t dilation_h, const int64_t dilation_w, float* columns, float* dst)
+    {
+        const int64_t ic_per_gp = channels / group;
+        const int64_t oc_per_gp = num_output / group;
+        // Set up for launching threads
+        std::size_t num_threads = std::thread::hardware_concurrency();
+        std::vector<std::thread> threads;
+        threads.reserve(num_threads);
+
+        for (int64_t b = 0; b < batch; ++b)
+        {
+            for (int64_t g = 0; g < group; ++g)
+            {
+                deformable_im2col_2d(
+                    src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
+                    offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
+                    mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w,
+                    src_h,
+                    src_w,
+                    kernel_h,
+                    kernel_w,
+                    pad_h,
+                    pad_w,
+                    stride_h,
+                    stride_w,
+                    dilation_h,
+                    dilation_w,
+                    ic_per_gp,
+                    offset_group,
+                    dst_h,
+                    dst_w,
+                    mask != nullptr,
+                    columns);
+                float* dst_ptr = dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
+                if (bias != nullptr)
+                {
+                    const float* bias_ptr = bias + g * oc_per_gp;
+                    for (int64_t oc = 0; oc < oc_per_gp; ++oc)
+                    {
+                        for (int64_t hw = 0; hw < dst_h * dst_w; ++hw)
+                        {
+                            dst_ptr[oc * dst_h * dst_w + hw] = bias_ptr[oc];
+                        }
+                    }
+                }
+                else
+                {
+                    memset(dst_ptr, 0.0f, sizeof(float) * oc_per_gp * dst_h * dst_w);
+                }
+                if (num_threads > 1)
+                {
+                    // Calculate values to pass to threads
+                    int32_t n_rows = (oc_per_gp + num_threads - 1) / num_threads;
+                    int32_t end_row = 0;
+                    for (int32_t i = 0; i < num_threads; i++)
+                    {
+                        auto start_row = i * n_rows;
+                        end_row = start_row + n_rows;
+                        if (end_row > oc_per_gp) end_row = oc_per_gp;
+                        std::thread t(parallel_unroll_gemm,
+                                      filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w,
+                                      columns,
+                                      nullptr,
+                                      dst_ptr,
+                                      oc_per_gp,
+                                      dst_h * dst_w,
+                                      ic_per_gp * kernel_h * kernel_w,
+                                      1.0f,
+                                      1.0f,
+                                      dst_ptr,
+                                      start_row,
+                                      end_row);
+                        threads.emplace_back(std::move(t));
+                    }
+                    // Wait for all threads to complete
+                    for (auto& t : threads) t.join();
+                    threads.clear();
+                }
+                else
+                {  // parallel gemm degrade to serial gemm with start_row=0 and end_row= oc_per_gp
+                    parallel_unroll_gemm(filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns, nullptr, dst_ptr, oc_per_gp, dst_h * dst_w, ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr, 0, oc_per_gp);
+                }
+            }
+        }
+    }
+
+    MMCVModulatedDeformConvKernel::MMCVModulatedDeformConvKernel(const OrtApi& api,
+                                                                 const OrtKernelInfo* info)
+        : ort_(api)
+        , info_(info)
+    {
+        std::vector<int64_t> stride = ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
+        stride_height_ = stride[0];
+        stride_width_ = stride[1];
+        std::vector<int64_t> padding = ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
+        padding_height_ = padding[0];
+        padding_width_ = padding[1];
+        std::vector<int64_t> dilation =
+            ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
+        dilation_height_ = dilation[0];
+        dilation_width_ = dilation[1];
+        deformable_group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
+        group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");
+
+        // create allocator
+        allocator_ = Ort::AllocatorWithDefaultOptions();
+    }
+
+    void MMCVModulatedDeformConvKernel::Compute(OrtKernelContext* context)
+    {
+        const int64_t stride_height = stride_height_;
+        const int64_t stride_width = stride_width_;
+        const int64_t padding_height = padding_height_;
+        const int64_t padding_width = padding_width_;
+        const int64_t dilation_height = dilation_height_;
+        const int64_t dilation_width = dilation_width_;
+        const int64_t deformable_group = deformable_group_;
+        const int64_t group = group_;
+
+        const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
+        const float* input_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(input));
+
+        const OrtValue* offset = ort_.KernelContext_GetInput(context, 1);
+        const float* offset_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(offset));
+
+        const OrtValue* mask = ort_.KernelContext_GetInput(context, 2);
+        const float* mask_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(mask));
+
+        const OrtValue* filter = ort_.KernelContext_GetInput(context, 3);
+        const float* filter_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(filter));
+
+        const OrtValue* bias = ort_.KernelContext_GetInput(context, 4);
+        const float* bias_data = (bias != nullptr) ? reinterpret_cast<const float*>(ort_.GetTensorData<float>(bias)) : nullptr;
+        // const float *bias_data = nullptr;
+
+        OrtTensorDimensions input_dims(ort_, input);
+        OrtTensorDimensions filter_dims(ort_, filter);
+
+        int64_t batch = input_dims[0];
+        int64_t channels = input_dims[1];
+        int64_t in_height = input_dims[2];
+        int64_t in_width = input_dims[3];
+        int64_t num_output = filter_dims[0];
+        int64_t kernel_height = filter_dims[2];
+        int64_t kernel_width = filter_dims[3];
+
+        // get output memory
+        int64_t out_height = floor(
+            (in_height + 2 * padding_height - dilation_height * (kernel_height - 1) - 1) / stride_height +
+            1);
+        int64_t out_width = floor(
+            (in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) / stride_width + 1);
+
+        std::vector<int64_t> output_dims = {batch, num_output, out_height, out_width};
+        OrtValue* output =
+            ort_.KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size());
+        float* out_ptr = ort_.GetTensorMutableData<float>(output);
+
+        // allocate tmp memory
+        int64_t column_len = (channels / group) * kernel_height * kernel_width * out_height * out_width;
+        float* columns = (float*)allocator_.Alloc(sizeof(float) * column_len);
+
+        deformable_conv2d_ref_fp32(input_data, offset_data, mask_data, filter_data, bias_data, batch, channels, in_height, in_width, num_output, out_height, out_width, group, deformable_group, channels, num_output, kernel_height, kernel_width, stride_height, stride_width, padding_height, padding_width, dilation_height, dilation_width, columns, out_ptr);

+        allocator_.Free(columns);
+    }
+    REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVModulatedDeformConvOp);
+    REGISTER_ONNXRUNTIME_OPS(mmcv, MMCVModulatedDeformConvOp);

 }  // namespace mmdeploy
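[editor's note] The deformable-conv reference above partitions the GEMM by output rows: M rows are split into ceil(M/T) chunks, each thread gets a [start_row, end_row) slice, and the main thread joins them all. A minimal standalone sketch of that scheme (illustrative only, not part of the patch; worker is a stand-in for parallel_unroll_gemm):

#include <algorithm>
#include <cstdio>
#include <thread>
#include <vector>

static void worker(int start_row, int end_row)
{
    std::printf("rows [%d, %d)\n", start_row, end_row);  // placeholder for the per-slice GEMM
}

int main()
{
    const int M = 10;  // e.g. output channels per group
    const int T = std::max(1u, std::thread::hardware_concurrency());
    const int n_rows = (M + T - 1) / T;  // rows per thread, rounded up

    std::vector<std::thread> threads;
    for (int i = 0; i < T; i++)
    {
        int start_row = i * n_rows;
        int end_row = std::min(start_row + n_rows, M);
        if (start_row >= M) break;  // more threads than rows
        threads.emplace_back(worker, start_row, end_row);
    }
    for (auto& t : threads) t.join();
    return 0;
}

Slicing by rows keeps each thread writing a disjoint region of Y, so no locking is needed; the serial fallback in the patch is just the same call with start_row=0 and end_row=M.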
diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.h b/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.h
index 772a9c4a88..7ffeb702d3 100644
--- a/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.h
+++ b/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.h
@@ -4,55 +4,74 @@

 #include <onnxruntime_cxx_api.h>

-namespace mmdeploy {
-
-struct MMCVModulatedDeformConvKernel {
-  MMCVModulatedDeformConvKernel(const OrtApi &api, const OrtKernelInfo *info);
-
-  void Compute(OrtKernelContext *context);
-
- protected:
-  Ort::CustomOpApi ort_;
-  const OrtKernelInfo *info_;
-  Ort::AllocatorWithDefaultOptions allocator_;
-
-  int64_t stride_height_;
-  int64_t stride_width_;
-  int64_t padding_height_;
-  int64_t padding_width_;
-  int64_t dilation_height_;
-  int64_t dilation_width_;
-  int64_t deformable_group_;
-  int64_t group_;
-};
-
-struct MMCVModulatedDeformConvOp
-    : Ort::CustomOpBase<MMCVModulatedDeformConvOp, MMCVModulatedDeformConvKernel> {
-  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const {
-    return new MMCVModulatedDeformConvKernel(api, info);
-  }
-
-  const char *GetName() const { return "MMCVModulatedDeformConv2d"; };
-
-  size_t GetInputTypeCount() const { return 5; };
-  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
-    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
-  };
-
-  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t index) const {
-    // The last input (index == 4) is optional, which is bias
-    if (index == 4) return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL;
-
-    return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
-  }
-
-  size_t GetOutputTypeCount() const { return 1; };
-  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
-    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
-  };
-
-  // force cpu
-  const char *GetExecutionProviderType() const { return "CPUExecutionProvider"; };
-};
+namespace mmdeploy
+{
+
+    struct MMCVModulatedDeformConvKernel
+    {
+        MMCVModulatedDeformConvKernel(const OrtApi& api, const OrtKernelInfo* info);
+
+        void Compute(OrtKernelContext* context);
+
+      protected:
+        Ort::CustomOpApi ort_;
+        const OrtKernelInfo* info_;
+        Ort::AllocatorWithDefaultOptions allocator_;
+
+        int64_t stride_height_;
+        int64_t stride_width_;
+        int64_t padding_height_;
+        int64_t padding_width_;
+        int64_t dilation_height_;
+        int64_t dilation_width_;
+        int64_t deformable_group_;
+        int64_t group_;
+    };
+
+    struct MMCVModulatedDeformConvOp
+        : Ort::CustomOpBase<MMCVModulatedDeformConvOp, MMCVModulatedDeformConvKernel>
+    {
+        void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const
+        {
+            return new MMCVModulatedDeformConvKernel(api, info);
+        }
+
+        const char* GetName() const
+        {
+            return "MMCVModulatedDeformConv2d";
+        };
+
+        size_t GetInputTypeCount() const
+        {
+            return 5;
+        };
+        ONNXTensorElementDataType GetInputType(size_t /*index*/) const
+        {
+            return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+        };
+
+        OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t index) const
+        {
+            // The last input (index == 4) is optional, which is bias
+            if (index == 4) return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL;
+
+            return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
+        }
+
+        size_t GetOutputTypeCount() const
+        {
+            return 1;
+        };
+        ONNXTensorElementDataType GetOutputType(size_t /*index*/) const
+        {
+            return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+        };
+
+        // force cpu
+        const char* GetExecutionProviderType() const
+        {
+            return "CPUExecutionProvider";
+        };
+    };

 }  // namespace mmdeploy

 #endif
diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.cpp
index 784be2c987..397bcbf92c 100644
--- a/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.cpp
+++ b/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.cpp
@@ -13,117 +13,132 @@

 #include "ort_utils.h"

-namespace mmdeploy {
-struct Box {
-  float x1, y1, x2, y2;
-};
-
-float nms_match_iou(Box box1, Box box2) {
-  auto inter_x1 = std::max(box1.x1, box2.x1);
-  auto inter_y1 = std::max(box1.y1, box2.y1);
-  auto inter_x2 = std::min(box1.x2, box2.x2);
-  auto inter_y2 = std::min(box1.y2, box2.y2);
-
-  auto eps = 1e-10;
-
-  auto w = std::max(static_cast<float>(0), inter_x2 - inter_x1);
-  auto h = std::max(static_cast<float>(0), inter_y2 - inter_y1);
-
-  auto area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1);
-  auto area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1);
-  auto inter = w * h;
-  auto ovr = inter / (area1 + area2 - inter + eps);
-  return ovr;
-}
-NMSMatchKernel::NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info)
-    : ort_(api), info_(info) {
-  // create allocator
-  allocator_ = Ort::AllocatorWithDefaultOptions();
-}
-
-void NMSMatchKernel::Compute(OrtKernelContext* context) {
-  const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0);
-  const float* boxes_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(boxes));
-  const OrtValue* scores = ort_.KernelContext_GetInput(context, 1);
-  const float* scores_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(scores));
-  const OrtValue* iou_threshold_ = ort_.KernelContext_GetInput(context, 2);
-  const float iou_threshold_data = ort_.GetTensorData<float>(iou_threshold_)[0];
-  const OrtValue* score_threshold_ = ort_.KernelContext_GetInput(context, 3);
-  const float score_threshold_data = ort_.GetTensorData<float>(score_threshold_)[0];
-
-  OrtTensorDimensions boxes_dim(ort_, boxes);
-  OrtTensorDimensions scores_dim(ort_, scores);
-  // loop over batch
-  int64_t nbatch = boxes_dim[0];
-  int64_t nboxes = boxes_dim[1];
-  int64_t nclass = scores_dim[1];
-  assert(boxes_dim[2] == 4);  //(x1, x2, y1, y2)
-  // alloc some temp memory
-  bool* select = (bool*)allocator_.Alloc(sizeof(bool) * nbatch * nboxes);
-
-  std::vector<int64_t> res_order;
-  for (int64_t k = 0; k < nbatch; k++) {
-    for (int64_t g = 0; g < nclass; g++) {
-      for (int64_t i = 0; i < nboxes; i++) {
-        select[i] = true;
-      }
-      // scores
-      // k * nboxes * nclass means per batch
-      // g * nboxes means per class
-      // batch = 2 boxes = 3 classes = 4
-      std::vector<float> tmp_sc;
-      // get the class scores
-      for (int i = 0; i < nboxes; i++) {
-        tmp_sc.push_back(scores_data[k * nboxes * nclass + g * nboxes + i]);
-      }
-
-      std::vector<int64_t> order(tmp_sc.size());
-      std::iota(order.begin(), order.end(), 0);
-      std::sort(order.begin(), order.end(),
-                [&tmp_sc](int64_t id1, int64_t id2) { return tmp_sc[id1] > tmp_sc[id2]; });
-      for (int64_t _i = 0; _i < nboxes; _i++) {
-        auto i = order[_i];
-        if (select[i] == false) continue;
-        std::vector<int64_t> v_i;
-        for (int64_t _j = _i + 1; _j < nboxes; _j++) {
-          auto j = order[_j];
-          if (select[j] == false) continue;
-          Box vbox1, vbox2;
-          vbox1.x1 = boxes_data[k * nboxes * 4 + i * 4];
-          vbox1.y1 = boxes_data[k * nboxes * 4 + i * 4 + 1];
-          vbox1.x2 = boxes_data[k * nboxes * 4 + i * 4 + 2];
-          vbox1.y2 = boxes_data[k * nboxes * 4 + i * 4 + 3];
-
-          vbox2.x1 = boxes_data[k * nboxes * 4 + j * 4];
-          vbox2.y1 = boxes_data[k * nboxes * 4 + j * 4 + 1];
-          vbox2.x2 = boxes_data[k * nboxes * 4 + j * 4 + 2];
-          vbox2.y2 = boxes_data[k * nboxes * 4 + j * 4 + 3];
-
-          auto ovr = nms_match_iou(vbox1, vbox2);
-          if (ovr >= iou_threshold_data) {
-            select[j] = false;
-            v_i.push_back(j);
-          }
-        }
-        if (tmp_sc[i] > score_threshold_data && v_i.size() != 0) {
-          for (int v_i_idx = 0; v_i_idx < v_i.size(); v_i_idx++) {
-            res_order.push_back(k);
-            res_order.push_back(g);
-            res_order.push_back(i);
-            res_order.push_back(v_i[v_i_idx]);
-          }
-        }
-      }
-    }
-  }
-  std::vector<int64_t> inds_dims({(int64_t)res_order.size() / 4, 4});
-
-  OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size());
-  int64_t* res_data = ort_.GetTensorMutableData<int64_t>(res);
-
-  memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());
-
-  allocator_.Free(select);
-}
-REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSMatchOp);
+namespace mmdeploy
+{
+    struct Box
+    {
+        float x1, y1, x2, y2;
+    };
+
+    float nms_match_iou(Box box1, Box box2)
+    {
+        auto inter_x1 = std::max(box1.x1, box2.x1);
+        auto inter_y1 = std::max(box1.y1, box2.y1);
+        auto inter_x2 = std::min(box1.x2, box2.x2);
+        auto inter_y2 = std::min(box1.y2, box2.y2);
+
+        auto eps = 1e-10;
+
+        auto w = std::max(static_cast<float>(0), inter_x2 - inter_x1);
+        auto h = std::max(static_cast<float>(0), inter_y2 - inter_y1);
+
+        auto area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1);
+        auto area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1);
+        auto inter = w * h;
+        auto ovr = inter / (area1 + area2 - inter + eps);
+        return ovr;
+    }
+    NMSMatchKernel::NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info)
+        : ort_(api)
+        , info_(info)
+    {
+        // create allocator
+        allocator_ = Ort::AllocatorWithDefaultOptions();
+    }
+
+    void NMSMatchKernel::Compute(OrtKernelContext* context)
+    {
+        const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0);
+        const float* boxes_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(boxes));
+        const OrtValue* scores = ort_.KernelContext_GetInput(context, 1);
+        const float* scores_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(scores));
+        const OrtValue* iou_threshold_ = ort_.KernelContext_GetInput(context, 2);
+        const float iou_threshold_data = ort_.GetTensorData<float>(iou_threshold_)[0];
+        const OrtValue* score_threshold_ = ort_.KernelContext_GetInput(context, 3);
+        const float score_threshold_data = ort_.GetTensorData<float>(score_threshold_)[0];
+
+        OrtTensorDimensions boxes_dim(ort_, boxes);
+        OrtTensorDimensions scores_dim(ort_, scores);
+        // loop over batch
+        int64_t nbatch = boxes_dim[0];
+        int64_t nboxes = boxes_dim[1];
+        int64_t nclass = scores_dim[1];
+        assert(boxes_dim[2] == 4);  //(x1, x2, y1, y2)
+        // alloc some temp memory
+        bool* select = (bool*)allocator_.Alloc(sizeof(bool) * nbatch * nboxes);
+
+        std::vector<int64_t> res_order;
+        for (int64_t k = 0; k < nbatch; k++)
+        {
+            for (int64_t g = 0; g < nclass; g++)
+            {
+                for (int64_t i = 0; i < nboxes; i++)
+                {
+                    select[i] = true;
+                }
+                // scores
+                // k * nboxes * nclass means per batch
+                // g * nboxes means per class
+                // batch = 2 boxes = 3 classes = 4
+                std::vector<float> tmp_sc;
+                // get the class scores
+                for (int i = 0; i < nboxes; i++)
+                {
+                    tmp_sc.push_back(scores_data[k * nboxes * nclass + g * nboxes + i]);
+                }
+
+                std::vector<int64_t> order(tmp_sc.size());
+                std::iota(order.begin(), order.end(), 0);
+                std::sort(order.begin(), order.end(), [&tmp_sc](int64_t id1, int64_t id2)
+                          { return tmp_sc[id1] > tmp_sc[id2]; });
+                for (int64_t _i = 0; _i < nboxes; _i++)
+                {
+                    auto i = order[_i];
+                    if (select[i] == false) continue;
+                    std::vector<int64_t> v_i;
+                    for (int64_t _j = _i + 1; _j < nboxes; _j++)
+                    {
+                        auto j = order[_j];
+                        if (select[j] == false) continue;
+                        Box vbox1, vbox2;
+                        vbox1.x1 = boxes_data[k * nboxes * 4 + i * 4];
+                        vbox1.y1 = boxes_data[k * nboxes * 4 + i * 4 + 1];
+                        vbox1.x2 = boxes_data[k * nboxes * 4 + i * 4 + 2];
+                        vbox1.y2 = boxes_data[k * nboxes * 4 + i * 4 + 3];
+
+                        vbox2.x1 = boxes_data[k * nboxes * 4 + j * 4];
+                        vbox2.y1 = boxes_data[k * nboxes * 4 + j * 4 + 1];
+                        vbox2.x2 = boxes_data[k * nboxes * 4 + j * 4 + 2];
+                        vbox2.y2 = boxes_data[k * nboxes * 4 + j * 4 + 3];
+
+                        auto ovr = nms_match_iou(vbox1, vbox2);
+                        if (ovr >= iou_threshold_data)
+                        {
+                            select[j] = false;
+                            v_i.push_back(j);
+                        }
+                    }
+                    if (tmp_sc[i] > score_threshold_data && v_i.size() != 0)
+                    {
+                        for (int v_i_idx = 0; v_i_idx < v_i.size(); v_i_idx++)
+                        {
+                            res_order.push_back(k);
+                            res_order.push_back(g);
+                            res_order.push_back(i);
+                            res_order.push_back(v_i[v_i_idx]);
+                        }
+                    }
+                }
+            }
+        }
+        std::vector<int64_t> inds_dims({(int64_t)res_order.size() / 4, 4});
+
+        OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size());
+        int64_t* res_data = ort_.GetTensorMutableData<int64_t>(res);

+        memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());

+        allocator_.Free(select);
+    }
+    REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSMatchOp);

 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.h b/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.h
index 57aa94d964..48e0d0dbb0 100644
--- a/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.h
+++ b/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.h
@@ -10,37 +10,55 @@
 #include <vector>
 #include <onnxruntime_cxx_api.h>

-namespace mmdeploy {
-struct NMSMatchKernel {
-  NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info);
-
-  void Compute(OrtKernelContext* context);
-
- private:
-  Ort::CustomOpApi ort_;
-  const OrtKernelInfo* info_;
-  Ort::AllocatorWithDefaultOptions allocator_;
-};
-
-struct NMSMatchOp : Ort::CustomOpBase<NMSMatchOp, NMSMatchKernel> {
-  void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
-    return new NMSMatchKernel(api, info);
-  }
-  const char* GetName() const { return "NMSMatch"; }
-
-  size_t GetInputTypeCount() const { return 4; }
-  ONNXTensorElementDataType GetInputType(size_t) const {
-    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
-  }
-
-  size_t GetOutputTypeCount() const { return 1; }
-  ONNXTensorElementDataType GetOutputType(size_t) const {
-    return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
-  }
-
-  // force cpu
-  const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; }
-};
+namespace mmdeploy
+{
+    struct NMSMatchKernel
+    {
+        NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info);
+
+        void Compute(OrtKernelContext* context);
+
+      private:
+        Ort::CustomOpApi ort_;
+        const OrtKernelInfo* info_;
+        Ort::AllocatorWithDefaultOptions allocator_;
+    };
+
+    struct NMSMatchOp : Ort::CustomOpBase<NMSMatchOp, NMSMatchKernel>
+    {
+        void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const
+        {
+            return new NMSMatchKernel(api, info);
+        }
+        const char* GetName() const
+        {
+            return "NMSMatch";
+        }
+
+        size_t GetInputTypeCount() const
+        {
+            return 4;
+        }
+        ONNXTensorElementDataType GetInputType(size_t) const
+        {
+            return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+        }
+
+        size_t GetOutputTypeCount() const
+        {
+            return 1;
+        }
+        ONNXTensorElementDataType GetOutputType(size_t) const
+        {
+            return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+        }
+
+        // force cpu
+        const char* GetExecutionProviderType() const
+        {
+            return "CPUExecutionProvider";
+        }
+    };

 }  // namespace mmdeploy

 #endif  // ONNXRUNTIME_NMS_MATCH_H
== 0.01745329251 - // double theta = box.a * 0.01745329251; - // MODIFIED - double theta = box.a; - float cosTheta2 = (float)cos(theta) * 0.5f; - float sinTheta2 = (float)sin(theta) * 0.5f; - - // y: top --> down; x: left --> right - pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; - pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; - pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; - pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; - pts[2].x = 2 * box.x_ctr - pts[0].x; - pts[2].y = 2 * box.y_ctr - pts[0].y; - pts[3].x = 2 * box.x_ctr - pts[1].x; - pts[3].y = 2 * box.y_ctr - pts[1].y; -} - -int get_intersection_points(const Point (&pts1)[4], const Point (&pts2)[4], - Point (&intersections)[24]) { - // Line vector - // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] - Point vec1[4], vec2[4]; - for (int i = 0; i < 4; i++) { - vec1[i] = pts1[(i + 1) % 4] - pts1[i]; - vec2[i] = pts2[(i + 1) % 4] - pts2[i]; - } - - // Line test - test all line combos for intersection - int num = 0; // number of intersections - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - // Solve for 2x2 Ax=b - float det = cross_2d(vec2[j], vec1[i]); - - // This takes care of parallel lines - if (fabs(det) <= 1e-14) { - continue; - } - - auto vec12 = pts2[j] - pts1[i]; - - float t1 = cross_2d(vec2[j], vec12) / det; - float t2 = cross_2d(vec1[i], vec12) / det; - - if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { - intersections[num++] = pts1[i] + vec1[i] * t1; - } - } - } - - // Check for vertices of rect1 inside rect2 - { - const auto& AB = vec2[0]; - const auto& DA = vec2[3]; - auto ABdotAB = dot_2d(AB, AB); - auto ADdotAD = dot_2d(DA, DA); - for (int i = 0; i < 4; i++) { - // assume ABCD is the rectangle, and P is the point to be judged - // P is inside ABCD iff. P's projection on AB lies within AB - // and P's projection on AD lies within AD - - auto AP = pts1[i] - pts2[0]; - - auto APdotAB = dot_2d(AP, AB); - auto APdotAD = -dot_2d(AP, DA); - - if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { - intersections[num++] = pts1[i]; - } - } - } - - // Reverse the check - check for vertices of rect2 inside rect1 - { - const auto& AB = vec1[0]; - const auto& DA = vec1[3]; - auto ABdotAB = dot_2d(AB, AB); - auto ADdotAD = dot_2d(DA, DA); - for (int i = 0; i < 4; i++) { - auto AP = pts2[i] - pts1[0]; - - auto APdotAB = dot_2d(AP, AB); - auto APdotAD = -dot_2d(AP, DA); - - if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { - intersections[num++] = pts2[i]; - } +namespace mmdeploy +{ + + namespace + { + struct RotatedBox + { + float x_ctr, y_ctr, w, h, a; + }; + struct Point + { + float x, y; + Point(const float& px = 0, const float& py = 0) + : x(px) + , y(py) + { + } + Point operator+(const Point& p) const + { + return Point(x + p.x, y + p.y); + } + Point& operator+=(const Point& p) + { + x += p.x; + y += p.y; + return *this; + } + Point operator-(const Point& p) const + { + return Point(x - p.x, y - p.y); + } + Point operator*(const float coeff) const + { + return Point(x * coeff, y * coeff); + } + }; + + float dot_2d(const Point& A, const Point& B) + { + return A.x * B.x + A.y * B.y; + } + + float cross_2d(const Point& A, const Point& B) + { + return A.x * B.y - B.x * A.y; + } + } // namespace + + void get_rotated_vertices(const RotatedBox& box, Point (&pts)[4]) + { + // M_PI / 180. 
== 0.01745329251 + // double theta = box.a * 0.01745329251; + // MODIFIED + double theta = box.a; + float cosTheta2 = (float)cos(theta) * 0.5f; + float sinTheta2 = (float)sin(theta) * 0.5f; + + // y: top --> down; x: left --> right + pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; + pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; + pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; + pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; + pts[2].x = 2 * box.x_ctr - pts[0].x; + pts[2].y = 2 * box.y_ctr - pts[0].y; + pts[3].x = 2 * box.x_ctr - pts[1].x; + pts[3].y = 2 * box.y_ctr - pts[1].y; } - } - - return num; -} - -int convex_hull_graham(const Point (&p)[24], const int& num_in, Point (&q)[24], - bool shift_to_zero = false) { - assert(num_in >= 2); - - // Step 1: - // Find point with minimum y - // if more than 1 points have the same minimum y, - // pick the one with the minimum x. - int t = 0; - for (int i = 1; i < num_in; i++) { - if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { - t = i; + + int get_intersection_points(const Point (&pts1)[4], const Point (&pts2)[4], Point (&intersections)[24]) + { + // Line vector + // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] + Point vec1[4], vec2[4]; + for (int i = 0; i < 4; i++) + { + vec1[i] = pts1[(i + 1) % 4] - pts1[i]; + vec2[i] = pts2[(i + 1) % 4] - pts2[i]; + } + + // Line test - test all line combos for intersection + int num = 0; // number of intersections + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + // Solve for 2x2 Ax=b + float det = cross_2d(vec2[j], vec1[i]); + + // This takes care of parallel lines + if (fabs(det) <= 1e-14) + { + continue; + } + + auto vec12 = pts2[j] - pts1[i]; + + float t1 = cross_2d(vec2[j], vec12) / det; + float t2 = cross_2d(vec1[i], vec12) / det; + + if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) + { + intersections[num++] = pts1[i] + vec1[i] * t1; + } + } + } + + // Check for vertices of rect1 inside rect2 + { + const auto& AB = vec2[0]; + const auto& DA = vec2[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) + { + // assume ABCD is the rectangle, and P is the point to be judged + // P is inside ABCD iff. 
P's projection on AB lies within AB + // and P's projection on AD lies within AD + + auto AP = pts1[i] - pts2[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) + { + intersections[num++] = pts1[i]; + } + } + } + + // Reverse the check - check for vertices of rect2 inside rect1 + { + const auto& AB = vec1[0]; + const auto& DA = vec1[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) + { + auto AP = pts2[i] - pts1[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) + { + intersections[num++] = pts2[i]; + } + } + } + + return num; } - } - auto& start = p[t]; // starting point - - // Step 2: - // Subtract starting point from every points (for sorting in the next step) - for (int i = 0; i < num_in; i++) { - q[i] = p[i] - start; - } - - // Swap the starting point to position 0 - auto tmp = q[0]; - q[0] = q[t]; - q[t] = tmp; - - // Step 3: - // Sort point 1 ~ num_in according to their relative cross-product values - // (essentially sorting according to angles) - // If the angles are the same, sort according to their distance to origin - float dist[24]; - for (int i = 0; i < num_in; i++) { - dist[i] = dot_2d(q[i], q[i]); - } - - // CPU version - std::sort(q + 1, q + num_in, [](const Point& A, const Point& B) -> bool { + + int convex_hull_graham(const Point (&p)[24], const int& num_in, Point (&q)[24], bool shift_to_zero = false) + { + assert(num_in >= 2); + + // Step 1: + // Find point with minimum y + // if more than 1 points have the same minimum y, + // pick the one with the minimum x. + int t = 0; + for (int i = 1; i < num_in; i++) + { + if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) + { + t = i; + } + } + auto& start = p[t]; // starting point + + // Step 2: + // Subtract starting point from every points (for sorting in the next step) + for (int i = 0; i < num_in; i++) + { + q[i] = p[i] - start; + } + + // Swap the starting point to position 0 + auto tmp = q[0]; + q[0] = q[t]; + q[t] = tmp; + + // Step 3: + // Sort point 1 ~ num_in according to their relative cross-product values + // (essentially sorting according to angles) + // If the angles are the same, sort according to their distance to origin + float dist[24]; + for (int i = 0; i < num_in; i++) + { + dist[i] = dot_2d(q[i], q[i]); + } + + // CPU version + std::sort(q + 1, q + num_in, [](const Point& A, const Point& B) -> bool + { float temp = cross_2d(A, B); if (fabs(temp) < 1e-6) { return dot_2d(A, A) < dot_2d(B, B); } else { return temp > 0; + } }); + // compute distance to origin after sort, since the points are now different. + for (int i = 0; i < num_in; i++) + { + dist[i] = dot_2d(q[i], q[i]); + } + + // Step 4: + // Make sure there are at least 2 points (that don't overlap with each other) + // in the stack + int k; // index of the non-overlapped second point + for (k = 1; k < num_in; k++) + { + if (dist[k] > 1e-8) + { + break; + } + } + if (k == num_in) + { + // We reach the end, which means the convex hull is just one point + q[0] = p[t]; + return 1; + } + q[1] = q[k]; + int m = 2; // 2 points in the stack + // Step 5: + // Finally we can start the scanning process. 
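The pop test in Step 5 leans entirely on the sign convention of cross_2d: positive for a counterclockwise pair, negative for clockwise, zero for collinear. A minimal standalone sketch of that convention (Pt and the main() harness are illustrative stand-ins, not patch code):

#include <cassert>

struct Pt
{
    float x, y;
};

static float cross_2d(const Pt& a, const Pt& b)
{
    return a.x * b.y - b.x * a.y;
}

int main()
{
    // +x rotated onto +y is a counterclockwise quarter turn: positive cross.
    assert(cross_2d(Pt{1, 0}, Pt{0, 1}) > 0);
    // The reversed pair is a clockwise turn: negative cross.
    assert(cross_2d(Pt{0, 1}, Pt{1, 0}) < 0);
    // Collinear vectors give exactly zero; the `>= 0` pop test treats that
    // as non-convex, so duplicated or collinear points never stay on the hull.
    assert(cross_2d(Pt{1, 1}, Pt{2, 2}) == 0.0f);
    return 0;
}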
+ // When a non-convex relationship between the 3 points is found + // (either concave shape or duplicated points), + // we pop the previous point from the stack + // until the 3-point relationship is convex again, or + // until the stack only contains two points + for (int i = k + 1; i < num_in; i++) + { + while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) + { + m--; + } + q[m++] = q[i]; + } + + // Step 6 (Optional): + // In general sense we need the original coordinates, so we + // need to shift the points back (reverting Step 2) + // But if we're only interested in getting the area/perimeter of the shape + // We can simply return. + if (!shift_to_zero) + { + for (int i = 0; i < m; i++) + { + q[i] += start; + } + } + + return m; } - }); - // compute distance to origin after sort, since the points are now different. - for (int i = 0; i < num_in; i++) { - dist[i] = dot_2d(q[i], q[i]); - } - - // Step 4: - // Make sure there are at least 2 points (that don't overlap with each other) - // in the stack - int k; // index of the non-overlapped second point - for (k = 1; k < num_in; k++) { - if (dist[k] > 1e-8) { - break; - } - } - if (k == num_in) { - // We reach the end, which means the convex hull is just one point - q[0] = p[t]; - return 1; - } - q[1] = q[k]; - int m = 2; // 2 points in the stack - // Step 5: - // Finally we can start the scanning process. - // When a non-convex relationship between the 3 points is found - // (either concave shape or duplicated points), - // we pop the previous point from the stack - // until the 3-point relationship is convex again, or - // until the stack only contains two points - for (int i = k + 1; i < num_in; i++) { - while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { - m--; - } - q[m++] = q[i]; - } - - // Step 6 (Optional): - // In general sense we need the original coordinates, so we - // need to shift the points back (reverting Step 2) - // But if we're only interested in getting the area/perimeter of the shape - // We can simply return. - if (!shift_to_zero) { - for (int i = 0; i < m; i++) { - q[i] += start; - } - } - - return m; -} - -float polygon_area(const Point (&q)[24], const int& m) { - if (m <= 2) { - return 0; - } - - float area = 0; - for (int i = 1; i < m - 1; i++) { - area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); - } - - return area / 2.0; -} - -float rotated_boxes_intersection(const RotatedBox& box1, const RotatedBox& box2) { - // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned - // from rotated_rect_intersection_pts - Point intersectPts[24], orderedPts[24]; - - Point pts1[4]; - Point pts2[4]; - get_rotated_vertices(box1, pts1); - get_rotated_vertices(box2, pts2); - - int num = get_intersection_points(pts1, pts2, intersectPts); - - if (num <= 2) { - return 0.0; - } - - // Convex Hull to order the intersection points in clockwise order and find - // the contour area. 
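polygon_area() then reduces that ordered ring to a scalar with a triangle-fan shoelace sum. A self-contained sketch of the same computation (polygon_area_fan and Pt are local stand-ins) checking that a hull-ordered unit square comes out with area 1:

#include <cassert>
#include <cmath>

struct Pt
{
    float x, y;
};

static float cross_2d(const Pt& a, const Pt& b)
{
    return a.x * b.y - b.x * a.y;
}

// Same fan decomposition as polygon_area(): triangles (q[0], q[i], q[i+1]);
// fabs() makes vertex orientation irrelevant because the hull is convex.
static float polygon_area_fan(const Pt* q, int m)
{
    if (m <= 2) return 0.f;
    float area = 0.f;
    for (int i = 1; i < m - 1; i++)
    {
        Pt u{q[i].x - q[0].x, q[i].y - q[0].y};
        Pt v{q[i + 1].x - q[0].x, q[i + 1].y - q[0].y};
        area += std::fabs(cross_2d(u, v));
    }
    return area / 2.0f;
}

int main()
{
    const Pt square[4] = {{0, 0}, {1, 0}, {1, 1}, {0, 1}};
    assert(std::fabs(polygon_area_fan(square, 4) - 1.0f) < 1e-6f);
    return 0;
}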
- int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); - return polygon_area(orderedPts, num_convex); -} - -NMSRotatedKernel::NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info) - : ort_(api), info_(info) { - iou_threshold_ = ort_.KernelInfoGetAttribute(info, "iou_threshold"); - score_threshold_ = ort_.KernelInfoGetAttribute(info, "score_threshold"); - - // create allocator - allocator_ = Ort::AllocatorWithDefaultOptions(); -} - -void NMSRotatedKernel::Compute(OrtKernelContext* context) { - const float iou_threshold = iou_threshold_; - const float score_threshold = score_threshold_; - - const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0); - const float* boxes_data = reinterpret_cast(ort_.GetTensorData(boxes)); - const OrtValue* scores = ort_.KernelContext_GetInput(context, 1); - const float* scores_data = reinterpret_cast(ort_.GetTensorData(scores)); - - OrtTensorDimensions boxes_dim(ort_, boxes); - OrtTensorDimensions scores_dim(ort_, scores); - - // loop over batch - int64_t nbatch = boxes_dim[0]; - int64_t nboxes = boxes_dim[1]; - int64_t nclass = scores_dim[1]; - assert(boxes_dim[2] == 5); //(cx,cy,w,h,theta) - - // allocate tmp memory - float* tmp_boxes = (float*)allocator_.Alloc(sizeof(float) * nbatch * nboxes * 5); - float* sc = (float*)allocator_.Alloc(sizeof(float) * nbatch * nclass * nboxes); - bool* select = (bool*)allocator_.Alloc(sizeof(bool) * nbatch * nboxes); - - memcpy(tmp_boxes, boxes_data, sizeof(float) * nbatch * nboxes * 5); - memcpy(sc, scores_data, sizeof(float) * nbatch * nclass * nboxes); - - // std::vector> res_order; - std::vector res_order; - for (int64_t k = 0; k < nbatch; k++) { - for (int64_t g = 0; g < nclass; g++) { - for (int64_t i = 0; i < nboxes; i++) { - select[i] = true; - } - // sort scores - std::vector tmp_sc; - for (int i = 0; i < nboxes; i++) { - tmp_sc.push_back(sc[k * nboxes * nclass + g * nboxes + i]); - } - std::vector order(tmp_sc.size()); - std::iota(order.begin(), order.end(), 0); - std::sort(order.begin(), order.end(), - [&tmp_sc](int64_t id1, int64_t id2) { return tmp_sc[id1] > tmp_sc[id2]; }); - for (int64_t _i = 0; _i < nboxes; _i++) { - if (select[_i] == false) continue; - auto i = order[_i]; - for (int64_t _j = _i + 1; _j < nboxes; _j++) { - if (select[_j] == false) continue; - auto j = order[_j]; - RotatedBox box1, box2; - auto center_shift_x = - (tmp_boxes[k * nboxes * 5 + i * 5] + tmp_boxes[k * nboxes * 5 + j * 5]) / 2.0; - auto center_shift_y = - (tmp_boxes[k * nboxes * 5 + i * 5 + 1] + tmp_boxes[k * nboxes * 5 + j * 5 + 1]) / 2.0; - box1.x_ctr = tmp_boxes[k * nboxes * 5 + i * 5] - center_shift_x; - box1.y_ctr = tmp_boxes[k * nboxes * 5 + i * 5 + 1] - center_shift_y; - box1.w = tmp_boxes[k * nboxes * 5 + i * 5 + 2]; - box1.h = tmp_boxes[k * nboxes * 5 + i * 5 + 3]; - box1.a = tmp_boxes[k * nboxes * 5 + i * 5 + 4]; - box2.x_ctr = tmp_boxes[k * nboxes * 5 + j * 5] - center_shift_x; - box2.y_ctr = tmp_boxes[k * nboxes * 5 + j * 5 + 1] - center_shift_y; - box2.w = tmp_boxes[k * nboxes * 5 + j * 5 + 2]; - box2.h = tmp_boxes[k * nboxes * 5 + j * 5 + 3]; - box2.a = tmp_boxes[k * nboxes * 5 + j * 5 + 4]; - auto area1 = box1.w * box1.h; - auto area2 = box2.w * box2.h; - auto intersection = rotated_boxes_intersection(box1, box2); - float baseS = 1.0; - baseS = (area1 + area2 - intersection); - auto ovr = intersection / baseS; - if (ovr > iou_threshold) select[_j] = false; + + float polygon_area(const Point (&q)[24], const int& m) + { + if (m <= 2) + { + return 0; } - } - for (int i = 0; i < 
nboxes; i++) { - if (select[i] & (tmp_sc[order[i]] > score_threshold)) { - res_order.push_back(k); - res_order.push_back(g); - res_order.push_back(order[i]); + + float area = 0; + for (int i = 1; i < m - 1; i++) + { + area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); } - } - } // class loop - } // batch loop - std::vector inds_dims({(int64_t)res_order.size() / 3, 3}); + return area / 2.0; + } + + float rotated_boxes_intersection(const RotatedBox& box1, const RotatedBox& box2) + { + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point intersectPts[24], orderedPts[24]; - OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size()); - int64_t* res_data = ort_.GetTensorMutableData(res); + Point pts1[4]; + Point pts2[4]; + get_rotated_vertices(box1, pts1); + get_rotated_vertices(box2, pts2); - memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size()); + int num = get_intersection_points(pts1, pts2, intersectPts); - allocator_.Free(tmp_boxes); - allocator_.Free(sc); - allocator_.Free(select); -} + if (num <= 2) + { + return 0.0; + } + + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. + int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); + return polygon_area(orderedPts, num_convex); + } + + NMSRotatedKernel::NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info) + : ort_(api) + , info_(info) + { + iou_threshold_ = ort_.KernelInfoGetAttribute(info, "iou_threshold"); + score_threshold_ = ort_.KernelInfoGetAttribute(info, "score_threshold"); + + // create allocator + allocator_ = Ort::AllocatorWithDefaultOptions(); + } + + void NMSRotatedKernel::Compute(OrtKernelContext* context) + { + const float iou_threshold = iou_threshold_; + const float score_threshold = score_threshold_; + + const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0); + const float* boxes_data = reinterpret_cast(ort_.GetTensorData(boxes)); + const OrtValue* scores = ort_.KernelContext_GetInput(context, 1); + const float* scores_data = reinterpret_cast(ort_.GetTensorData(scores)); + + OrtTensorDimensions boxes_dim(ort_, boxes); + OrtTensorDimensions scores_dim(ort_, scores); + + // loop over batch + int64_t nbatch = boxes_dim[0]; + int64_t nboxes = boxes_dim[1]; + int64_t nclass = scores_dim[1]; + assert(boxes_dim[2] == 5); //(cx,cy,w,h,theta) + + // allocate tmp memory + float* tmp_boxes = (float*)allocator_.Alloc(sizeof(float) * nbatch * nboxes * 5); + float* sc = (float*)allocator_.Alloc(sizeof(float) * nbatch * nclass * nboxes); + bool* select = (bool*)allocator_.Alloc(sizeof(bool) * nbatch * nboxes); + + memcpy(tmp_boxes, boxes_data, sizeof(float) * nbatch * nboxes * 5); + memcpy(sc, scores_data, sizeof(float) * nbatch * nclass * nboxes); + + // std::vector> res_order; + std::vector res_order; + for (int64_t k = 0; k < nbatch; k++) + { + for (int64_t g = 0; g < nclass; g++) + { + for (int64_t i = 0; i < nboxes; i++) + { + select[i] = true; + } + // sort scores + std::vector tmp_sc; + for (int i = 0; i < nboxes; i++) + { + tmp_sc.push_back(sc[k * nboxes * nclass + g * nboxes + i]); + } + std::vector order(tmp_sc.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&tmp_sc](int64_t id1, int64_t id2) + { return tmp_sc[id1] > tmp_sc[id2]; }); + for (int64_t _i = 0; _i < nboxes; _i++) + { + if (select[_i] == false) continue; + auto i = order[_i]; + for (int64_t _j = _i + 1; _j < 
nboxes; _j++) + { + if (select[_j] == false) continue; + auto j = order[_j]; + RotatedBox box1, box2; + auto center_shift_x = + (tmp_boxes[k * nboxes * 5 + i * 5] + tmp_boxes[k * nboxes * 5 + j * 5]) / 2.0; + auto center_shift_y = + (tmp_boxes[k * nboxes * 5 + i * 5 + 1] + tmp_boxes[k * nboxes * 5 + j * 5 + 1]) / 2.0; + box1.x_ctr = tmp_boxes[k * nboxes * 5 + i * 5] - center_shift_x; + box1.y_ctr = tmp_boxes[k * nboxes * 5 + i * 5 + 1] - center_shift_y; + box1.w = tmp_boxes[k * nboxes * 5 + i * 5 + 2]; + box1.h = tmp_boxes[k * nboxes * 5 + i * 5 + 3]; + box1.a = tmp_boxes[k * nboxes * 5 + i * 5 + 4]; + box2.x_ctr = tmp_boxes[k * nboxes * 5 + j * 5] - center_shift_x; + box2.y_ctr = tmp_boxes[k * nboxes * 5 + j * 5 + 1] - center_shift_y; + box2.w = tmp_boxes[k * nboxes * 5 + j * 5 + 2]; + box2.h = tmp_boxes[k * nboxes * 5 + j * 5 + 3]; + box2.a = tmp_boxes[k * nboxes * 5 + j * 5 + 4]; + auto area1 = box1.w * box1.h; + auto area2 = box2.w * box2.h; + auto intersection = rotated_boxes_intersection(box1, box2); + float baseS = 1.0; + baseS = (area1 + area2 - intersection); + auto ovr = intersection / baseS; + if (ovr > iou_threshold) select[_j] = false; + } + } + for (int i = 0; i < nboxes; i++) + { + if (select[i] & (tmp_sc[order[i]] > score_threshold)) + { + res_order.push_back(k); + res_order.push_back(g); + res_order.push_back(order[i]); + } + } + } // class loop + } // batch loop + + std::vector inds_dims({(int64_t)res_order.size() / 3, 3}); + + OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size()); + int64_t* res_data = ort_.GetTensorMutableData(res); + + memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size()); + + allocator_.Free(tmp_boxes); + allocator_.Free(sc); + allocator_.Free(select); + } -REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSRotatedOp); + REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSRotatedOp); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.h b/csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.h index 6ed44ce410..3b4aa856a5 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.h +++ b/csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.h @@ -10,39 +10,57 @@ #include #include -namespace mmdeploy { -struct NMSRotatedKernel { - NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info); - - void Compute(OrtKernelContext* context); - - private: - Ort::CustomOpApi ort_; - const OrtKernelInfo* info_; - Ort::AllocatorWithDefaultOptions allocator_; - float iou_threshold_; - float score_threshold_; -}; - -struct NMSRotatedOp : Ort::CustomOpBase { - void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const { - return new NMSRotatedKernel(api, info); - } - const char* GetName() const { return "NMSRotated"; } - - size_t GetInputTypeCount() const { return 2; } - ONNXTensorElementDataType GetInputType(size_t) const { - return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - } - - size_t GetOutputTypeCount() const { return 1; } - ONNXTensorElementDataType GetOutputType(size_t) const { - return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; - } - - // force cpu - const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; } -}; +namespace mmdeploy +{ + struct NMSRotatedKernel + { + NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info); + + void Compute(OrtKernelContext* context); + + private: + Ort::CustomOpApi ort_; + const OrtKernelInfo* info_; + Ort::AllocatorWithDefaultOptions allocator_; + float iou_threshold_; + float 
score_threshold_; + }; + + struct NMSRotatedOp : Ort::CustomOpBase + { + void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const + { + return new NMSRotatedKernel(api, info); + } + const char* GetName() const + { + return "NMSRotated"; + } + + size_t GetInputTypeCount() const + { + return 2; + } + ONNXTensorElementDataType GetInputType(size_t) const + { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + } + + size_t GetOutputTypeCount() const + { + return 1; + } + ONNXTensorElementDataType GetOutputType(size_t) const + { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; + } + + // force cpu + const char* GetExecutionProviderType() const + { + return "CPUExecutionProvider"; + } + }; } // namespace mmdeploy #endif // ONNXRUNTIME_NMS_ROTATED_H diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/onnxruntime_register.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/onnxruntime_register.cpp index f7b9cedff8..1159496843 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/onnxruntime_register.cpp +++ b/csrc/mmdeploy/backend_ops/onnxruntime/onnxruntime_register.cpp @@ -3,25 +3,30 @@ #include "ort_utils.h" -const char *c_MMDeployOpDomain = "mmdeploy"; +const char* c_MMDeployOpDomain = "mmdeploy"; -OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, const OrtApiBase *api) { - const OrtApi *kOrtApi = api->GetApi(ORT_API_VERSION); - OrtStatus *status = nullptr; - for (auto &_op_list_pair : mmdeploy::get_mmdeploy_custom_ops()) { - OrtCustomOpDomain *domain = nullptr; - if (auto status = kOrtApi->CreateCustomOpDomain(_op_list_pair.first.c_str(), &domain)) { - return status; +OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) +{ + const OrtApi* kOrtApi = api->GetApi(ORT_API_VERSION); + OrtStatus* status = nullptr; + for (auto& _op_list_pair : mmdeploy::get_mmdeploy_custom_ops()) + { + OrtCustomOpDomain* domain = nullptr; + if (auto status = kOrtApi->CreateCustomOpDomain(_op_list_pair.first.c_str(), &domain)) + { + return status; + } + auto& _op_list = _op_list_pair.second; + for (auto& _op : _op_list) + { + if (auto status = kOrtApi->CustomOpDomain_Add(domain, _op)) + { + return status; + } + } + // TODO: figure out what will return if failed. + status = kOrtApi->AddCustomOpDomain(options, domain); } - auto &_op_list = _op_list_pair.second; - for (auto &_op : _op_list) { - if (auto status = kOrtApi->CustomOpDomain_Add(domain, _op)) { - return status; - } - } - // TODO: figure out what will return if failed. 
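For readers wiring this up: RegisterCustomOps is the hook a client calls (directly when linking the ops library, or indirectly via onnxruntime's custom-ops-library loading APIs) to make the "mmdeploy" domain visible to a session. A minimal consumer sketch assuming direct linking; the model path is a placeholder and the declaration must match how ort_utils.h exports the symbol:

#include <onnxruntime_cxx_api.h>

// Declaration assumed to match the definition above / ort_utils.h.
OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api);

int main()
{
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "mmdeploy");
    Ort::SessionOptions so;
    // Register the "mmdeploy" custom-op domain (NMSMatch, NMSRotated, ...)
    // before creating a session over a model that references those ops.
    if (OrtStatus* status = RegisterCustomOps(so, OrtGetApiBase()))
    {
        Ort::GetApi().ReleaseStatus(status);  // registration failed
        return 1;
    }
    Ort::Session session(env, "end2end.onnx", so);  // placeholder model path
    return 0;
}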
- status = kOrtApi->AddCustomOpDomain(options, domain); - } - return status; + return status; } diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.cpp index a8e7023fe1..4fbf6365d0 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.cpp +++ b/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.cpp @@ -5,233 +5,245 @@ #include "ort_utils.h" -namespace mmdeploy { -// implementation taken from Caffe2 -struct PreCalc { - int pos1; - int pos2; - int pos3; - int pos4; - float w1; - float w2; - float w3; - float w4; -}; - -void pre_calc_for_bilinear_interpolate(const int height, const int width, const int pooled_height, - const int pooled_width, const int iy_upper, - const int ix_upper, float roi_start_h, float roi_start_w, - float bin_size_h, float bin_size_w, int roi_bin_grid_h, - int roi_bin_grid_w, float roi_center_h, float roi_center_w, - float cos_theta, float sin_theta, - std::vector &pre_calc) { - int pre_calc_index = 0; - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - for (int iy = 0; iy < iy_upper; iy++) { - const float yy = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < ix_upper; ix++) { - const float xx = - roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); - - // Rotate by theta around the center and translate - // In image space, (y, x) is the order for Right Handed System, - // and this is essentially multiplying the point by a rotation matrix - // to rotate it counterclockwise through angle theta. - float y = yy * cos_theta - xx * sin_theta + roi_center_h; - float x = yy * sin_theta + xx * cos_theta + roi_center_w; - // deal with: inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - // empty - PreCalc pc; - pc.pos1 = 0; - pc.pos2 = 0; - pc.pos3 = 0; - pc.pos4 = 0; - pc.w1 = 0; - pc.w2 = 0; - pc.w3 = 0; - pc.w4 = 0; - pre_calc[pre_calc_index] = pc; - pre_calc_index += 1; - continue; - } - - if (y < 0) { - y = 0; - } - if (x < 0) { - x = 0; - } - - int y_low = (int)y; - int x_low = (int)x; - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (float)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (float)x_low; - } else { - x_high = x_low + 1; - } - - float ly = y - y_low; - float lx = x - x_low; - float hy = 1. - ly, hx = 1. 
- lx; - float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - // save weights and indices - PreCalc pc; - pc.pos1 = y_low * width + x_low; - pc.pos2 = y_low * width + x_high; - pc.pos3 = y_high * width + x_low; - pc.pos4 = y_high * width + x_high; - pc.w1 = w1; - pc.w2 = w2; - pc.w3 = w3; - pc.w4 = w4; - pre_calc[pre_calc_index] = pc; - - pre_calc_index += 1; +namespace mmdeploy +{ + // implementation taken from Caffe2 + struct PreCalc + { + int pos1; + int pos2; + int pos3; + int pos4; + float w1; + float w2; + float w3; + float w4; + }; + + void pre_calc_for_bilinear_interpolate(const int height, const int width, const int pooled_height, const int pooled_width, const int iy_upper, const int ix_upper, float roi_start_h, float roi_start_w, float bin_size_h, float bin_size_w, int roi_bin_grid_h, int roi_bin_grid_w, float roi_center_h, float roi_center_w, float cos_theta, float sin_theta, std::vector& pre_calc) + { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) + { + for (int pw = 0; pw < pooled_width; pw++) + { + for (int iy = 0; iy < iy_upper; iy++) + { + const float yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) + { + const float xx = + roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + // In image space, (y, x) is the order for Right Handed System, + // and this is essentially multiplying the point by a rotation matrix + // to rotate it counterclockwise through angle theta. + float y = yy * cos_theta - xx * sin_theta + roi_center_h; + float x = yy * sin_theta + xx * cos_theta + roi_center_w; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) + { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y < 0) + { + y = 0; + } + if (x < 0) + { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) + { + y_high = y_low = height - 1; + y = (float)y_low; + } + else + { + y_high = y_low + 1; + } + + if (x_low >= width - 1) + { + x_high = x_low = width - 1; + x = (float)x_low; + } + else + { + x_high = x_low + 1; + } + + float ly = y - y_low; + float lx = x - x_low; + float hy = 1. - ly, hx = 1. 
- lx; + float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indices + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } } - } - } - } -} - -void ROIAlignRotatedForwardCPU(const int nthreads, const float *input, const float *rois, - float *output, const float &spatial_scale, const int aligned, - const int clockwise, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int sampling_ratio) { - int n_rois = nthreads / channels / pooled_width / pooled_height; - // (n, c, ph, pw) is an element in the pooled output - // can be parallelized using omp - // #pragma omp parallel for num_threads(32) - for (int n = 0; n < n_rois; n++) { - int index_n = n * channels * pooled_width * pooled_height; - - const float *current_roi = rois + n * 6; - int roi_batch_ind = current_roi[0]; - - // Do not use rounding; this implementation detail is critical - float offset = aligned ? (float)0.5 : (float)0.0; - float roi_center_w = current_roi[1] * spatial_scale - offset; - float roi_center_h = current_roi[2] * spatial_scale - offset; - float roi_width = current_roi[3] * spatial_scale; - float roi_height = current_roi[4] * spatial_scale; - // float theta = current_roi[5] * M_PI / 180.0; - float theta = current_roi[5]; // Radian angle by default - if (clockwise) { - theta = -theta; } - float cos_theta = cos(theta); - float sin_theta = sin(theta); - if (!aligned) { // for backward-compatibility only - roi_width = std::max(roi_width, (float)1.); - roi_height = std::max(roi_height, (float)1.); - } - - float bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - float bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 - int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - - // We do average (integral) pooling inside a bin - const float count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 - - // we want to precalculate indices and weights shared by all channels, - // this is the key point of optimization - std::vector pre_calc(roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); - - // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). - // Appropriate translation needs to be applied after. 
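That comment is the crux of the sampling scheme: bin offsets are generated relative to the RoI center, rotated, and only then translated. A tiny numeric sketch of the same rotate-then-translate formula (all values made up):

#include <cassert>
#include <cmath>

int main()
{
    // RoI of width 4 and height 2, centered at (10, 20) on the feature map.
    const float roi_center_w = 10.f, roi_center_h = 20.f;
    const float roi_width = 4.f, roi_height = 2.f;
    const float xx = -roi_width / 2.f;   // roi_start_w
    const float yy = -roi_height / 2.f;  // roi_start_h
    const float theta = 0.f;             // no rotation
    // Rotate the center-relative offset, then translate by the RoI center.
    const float y = yy * std::cos(theta) - xx * std::sin(theta) + roi_center_h;
    const float x = yy * std::sin(theta) + xx * std::cos(theta) + roi_center_w;
    // With theta == 0 the first sample lands at the RoI's top-left corner.
    assert(std::fabs(x - 8.f) < 1e-6f && std::fabs(y - 19.f) < 1e-6f);
    return 0;
}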
- float roi_start_h = -roi_height / 2.0; - float roi_start_w = -roi_width / 2.0; - pre_calc_for_bilinear_interpolate(height, width, pooled_height, pooled_width, roi_bin_grid_h, - roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, - bin_size_w, roi_bin_grid_h, roi_bin_grid_w, roi_center_h, - roi_center_w, cos_theta, sin_theta, pre_calc); - - for (int c = 0; c < channels; c++) { - int index_n_c = index_n + c * pooled_width * pooled_height; - const float *offset_input = input + (roi_batch_ind * channels + c) * height * width; - int pre_calc_index = 0; + void ROIAlignRotatedForwardCPU(const int nthreads, const float* input, const float* rois, float* output, const float& spatial_scale, const int aligned, const int clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio) + { + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) + { + int index_n = n * channels * pooled_width * pooled_height; + + const float* current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + float offset = aligned ? (float)0.5 : (float)0.0; + float roi_center_w = current_roi[1] * spatial_scale - offset; + float roi_center_h = current_roi[2] * spatial_scale - offset; + float roi_width = current_roi[3] * spatial_scale; + float roi_height = current_roi[4] * spatial_scale; + // float theta = current_roi[5] * M_PI / 180.0; + float theta = current_roi[5]; // Radian angle by default + if (clockwise) + { + theta = -theta; + } + float cos_theta = cos(theta); + float sin_theta = sin(theta); + if (!aligned) + { // for backward-compatibility only + roi_width = std::max(roi_width, (float)1.); + roi_height = std::max(roi_height, (float)1.); + } - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - int index = index_n_c + ph * pooled_width + pw; + float bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + float bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const float count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + + // we want to precalculate indices and weights shared by all channels, + // this is the key point of optimization + std::vector pre_calc(roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. 
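Inside pre_calc_for_bilinear_interpolate() every in-bounds sample is split over its four neighbours with weights hy*hx, hy*lx, ly*hx, ly*lx, which always sum to one, so the later division by `count` yields an unbiased bin average. A quick standalone check of that partition-of-unity property (coordinates are arbitrary):

#include <cassert>
#include <cmath>

int main()
{
    // A sample point strictly inside a cell; the floor gives x_low / y_low.
    const float x = 3.25f, y = 7.6f;
    const float lx = x - std::floor(x), ly = y - std::floor(y);
    const float hx = 1.f - lx, hy = 1.f - ly;
    const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
    // (hy + ly) * (hx + lx) == 1, so the four weights partition unity.
    assert(std::fabs((w1 + w2 + w3 + w4) - 1.0f) < 1e-6f);
    return 0;
}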
+ float roi_start_h = -roi_height / 2.0; + float roi_start_w = -roi_width / 2.0; + + pre_calc_for_bilinear_interpolate(height, width, pooled_height, pooled_width, roi_bin_grid_h, roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta, sin_theta, pre_calc); + + for (int c = 0; c < channels; c++) + { + int index_n_c = index_n + c * pooled_width * pooled_height; + const float* offset_input = input + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) + { + for (int pw = 0; pw < pooled_width; pw++) + { + int index = index_n_c + ph * pooled_width + pw; + + float output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) + { + for (int ix = 0; ix < roi_bin_grid_w; ix++) + { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_input[pc.pos1] + pc.w2 * offset_input[pc.pos2] + + pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + output[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n + } - float output_val = 0.; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - PreCalc pc = pre_calc[pre_calc_index]; - output_val += pc.w1 * offset_input[pc.pos1] + pc.w2 * offset_input[pc.pos2] + - pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; + void MMCVRoIAlignRotatedKernel::Compute(OrtKernelContext* context) + { + // Setup inputs + const OrtValue* input_X = ort_.KernelContext_GetInput(context, 0); + const float* X_data = reinterpret_cast(ort_.GetTensorData(input_X)); + const OrtValue* input_rois = ort_.KernelContext_GetInput(context, 1); + const float* rois = + reinterpret_cast(ort_.GetTensorData(input_rois)); + + // Setup output + OrtTensorDimensions out_dimensions(ort_, input_X); + OrtTensorDimensions roi_dimensions(ort_, input_rois); + + int batch_size = out_dimensions.data()[0]; + int input_channels = out_dimensions.data()[1]; + int input_height = out_dimensions.data()[2]; + int input_width = out_dimensions.data()[3]; + + out_dimensions.data()[0] = roi_dimensions.data()[0]; + out_dimensions.data()[2] = aligned_height_; + out_dimensions.data()[3] = aligned_width_; + + OrtValue* output = + ort_.KernelContext_GetOutput(context, 0, out_dimensions.data(), out_dimensions.size()); + float* out = ort_.GetTensorMutableData(output); + OrtTensorTypeAndShapeInfo* output_info = ort_.GetTensorTypeAndShape(output); + ort_.ReleaseTensorTypeAndShapeInfo(output_info); + + // TODO: forward here + int output_size = out_dimensions.data()[0]; + for (auto i = 1; i < out_dimensions.size(); ++i) + { + output_size *= out_dimensions.data()[i]; + } + ROIAlignRotatedForwardCPU(output_size, X_data, rois, out, spatial_scale_, aligned_, clockwise_, input_channels, input_height, input_width, aligned_height_, aligned_width_, sampling_ratio_); + } - pre_calc_index += 1; - } - } - output_val /= count; - - output[index] = output_val; - } // for pw - } // for ph - } // for c - } // for n -} - -void MMCVRoIAlignRotatedKernel::Compute(OrtKernelContext *context) { - // Setup inputs - const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0); - const float *X_data = reinterpret_cast(ort_.GetTensorData(input_X)); - const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1); - const float *rois = - reinterpret_cast(ort_.GetTensorData(input_rois)); - - // Setup output - OrtTensorDimensions 
out_dimensions(ort_, input_X); - OrtTensorDimensions roi_dimensions(ort_, input_rois); - - int batch_size = out_dimensions.data()[0]; - int input_channels = out_dimensions.data()[1]; - int input_height = out_dimensions.data()[2]; - int input_width = out_dimensions.data()[3]; - - out_dimensions.data()[0] = roi_dimensions.data()[0]; - out_dimensions.data()[2] = aligned_height_; - out_dimensions.data()[3] = aligned_width_; - - OrtValue *output = - ort_.KernelContext_GetOutput(context, 0, out_dimensions.data(), out_dimensions.size()); - float *out = ort_.GetTensorMutableData(output); - OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output); - ort_.ReleaseTensorTypeAndShapeInfo(output_info); - - // TODO: forward here - int output_size = out_dimensions.data()[0]; - for (auto i = 1; i < out_dimensions.size(); ++i) { - output_size *= out_dimensions.data()[i]; - } - ROIAlignRotatedForwardCPU(output_size, X_data, rois, out, spatial_scale_, aligned_, clockwise_, - input_channels, input_height, input_width, aligned_height_, - aligned_width_, sampling_ratio_); -} - -REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVRoIAlignRotatedCustomOp); + REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVRoIAlignRotatedCustomOp); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.h b/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.h index c0129d31f8..24a90e5321 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.h +++ b/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.h @@ -10,50 +10,70 @@ #include #include -namespace mmdeploy { -struct MMCVRoIAlignRotatedKernel { - public: - MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info) : ort_(ort) { - aligned_height_ = ort_.KernelInfoGetAttribute(info, "output_height"); - aligned_width_ = ort_.KernelInfoGetAttribute(info, "output_width"); - sampling_ratio_ = ort_.KernelInfoGetAttribute(info, "sampling_ratio"); - spatial_scale_ = ort_.KernelInfoGetAttribute(info, "spatial_scale"); - aligned_ = ort_.KernelInfoGetAttribute(info, "aligned"); - clockwise_ = ort_.KernelInfoGetAttribute(info, "clockwise"); - } - - void Compute(OrtKernelContext* context); - - private: - Ort::CustomOpApi ort_; - int aligned_height_; - int aligned_width_; - float spatial_scale_; - int sampling_ratio_; - int aligned_; - int clockwise_; -}; - -struct MMCVRoIAlignRotatedCustomOp - : Ort::CustomOpBase { - void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const { - return new MMCVRoIAlignRotatedKernel(api, info); - } - const char* GetName() const { return "MMCVRoIAlignRotated"; } - - size_t GetInputTypeCount() const { return 2; } - ONNXTensorElementDataType GetInputType(size_t) const { - return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - } - - size_t GetOutputTypeCount() const { return 1; } - ONNXTensorElementDataType GetOutputType(size_t) const { - return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - } - - // force cpu - const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; } -}; +namespace mmdeploy +{ + struct MMCVRoIAlignRotatedKernel + { + public: + MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info) + : ort_(ort) + { + aligned_height_ = ort_.KernelInfoGetAttribute(info, "output_height"); + aligned_width_ = ort_.KernelInfoGetAttribute(info, "output_width"); + sampling_ratio_ = ort_.KernelInfoGetAttribute(info, "sampling_ratio"); + spatial_scale_ = ort_.KernelInfoGetAttribute(info, 
"spatial_scale"); + aligned_ = ort_.KernelInfoGetAttribute(info, "aligned"); + clockwise_ = ort_.KernelInfoGetAttribute(info, "clockwise"); + } + + void Compute(OrtKernelContext* context); + + private: + Ort::CustomOpApi ort_; + int aligned_height_; + int aligned_width_; + float spatial_scale_; + int sampling_ratio_; + int aligned_; + int clockwise_; + }; + + struct MMCVRoIAlignRotatedCustomOp + : Ort::CustomOpBase + { + void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const + { + return new MMCVRoIAlignRotatedKernel(api, info); + } + const char* GetName() const + { + return "MMCVRoIAlignRotated"; + } + + size_t GetInputTypeCount() const + { + return 2; + } + ONNXTensorElementDataType GetInputType(size_t) const + { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + } + + size_t GetOutputTypeCount() const + { + return 1; + } + ONNXTensorElementDataType GetOutputType(size_t) const + { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + } + + // force cpu + const char* GetExecutionProviderType() const + { + return "CPUExecutionProvider"; + } + }; } // namespace mmdeploy #endif // ONNXRUNTIME_ROI_ALIGN_ROTATED_H diff --git a/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp b/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp index 431f2dd63b..8edec279c5 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp @@ -9,225 +9,294 @@ #include "nms/kernel.h" #include "trt_serialize.hpp" -namespace mmdeploy { -using namespace nvinfer1; -using nvinfer1::plugin::NMSParameters; - -namespace { -static const char* NMS_PLUGIN_VERSION{"1"}; -static const char* NMS_PLUGIN_NAME{"TRTBatchedNMS"}; -} // namespace - -TRTBatchedNMS::TRTBatchedNMS(const std::string& name, NMSParameters params, bool returnIndex) - : TRTPluginBase(name), param(params), mReturnIndex(returnIndex) {} - -TRTBatchedNMS::TRTBatchedNMS(const std::string& name, const void* data, size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, ¶m); - deserialize_value(&data, &length, &mClipBoxes); - deserialize_value(&data, &length, &mReturnIndex); -} - -int TRTBatchedNMS::getNbOutputs() const TRT_NOEXCEPT { - int num = mReturnIndex ? 3 : 2; - return num; -} - -nvinfer1::DimsExprs TRTBatchedNMS::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { - ASSERT(nbInputs == 2); - ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs()); - ASSERT(inputs[0].nbDims == 4); - ASSERT(inputs[1].nbDims == 3); - - nvinfer1::DimsExprs ret; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = exprBuilder.constant(param.keepTopK); - switch (outputIndex) { - case 0: - ret.nbDims = 3; - ret.d[2] = exprBuilder.constant(5); - break; - case 1: - ret.nbDims = 2; - break; - case 2: - ret.nbDims = 2; - default: - break; - } - - return ret; -} - -size_t TRTBatchedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const TRT_NOEXCEPT { - size_t batch_size = inputs[0].dims.d[0]; - size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3]; - size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2]; - size_t num_priors = inputs[0].dims.d[1]; - bool shareLocation = (inputs[0].dims.d[2] == 1); - int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? 
param.topK : inputs[1].dims.d[1]; - return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size, - param.numClasses, num_priors, topk, DataType::kFLOAT, - DataType::kFLOAT); -} - -int TRTBatchedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, - void* const* outputs, void* workSpace, - cudaStream_t stream) TRT_NOEXCEPT { - const void* const locData = inputs[0]; - const void* const confData = inputs[1]; - - void* nmsedDets = outputs[0]; - void* nmsedLabels = outputs[1]; - void* nmsedIndex = mReturnIndex ? outputs[2] : nullptr; - - size_t batch_size = inputDesc[0].dims.d[0]; - size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3]; - size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2]; - size_t num_priors = inputDesc[0].dims.d[1]; - bool shareLocation = (inputDesc[0].dims.d[2] == 1); - - int topk = - param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? param.topK : inputDesc[1].dims.d[1]; - bool rotated = false; - pluginStatus_t status = nmsInference( - stream, batch_size, boxes_size, score_size, shareLocation, param.backgroundLabelId, - num_priors, param.numClasses, topk, param.keepTopK, param.scoreThreshold, param.iouThreshold, - DataType::kFLOAT, locData, DataType::kFLOAT, confData, nmsedDets, nmsedLabels, nmsedIndex, - workSpace, param.isNormalized, false, mClipBoxes, rotated); - ASSERT(status == STATUS_SUCCESS); - - return 0; -} - -size_t TRTBatchedNMS::getSerializationSize() const TRT_NOEXCEPT { - // NMSParameters - return sizeof(NMSParameters) + sizeof(mClipBoxes) + sizeof(mReturnIndex); -} - -void TRTBatchedNMS::serialize(void* buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, param); - serialize_value(&buffer, mClipBoxes); - serialize_value(&buffer, mReturnIndex); -} - -void TRTBatchedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* outputs, - int nbOutputs) TRT_NOEXCEPT { - // Validate input arguments -} - -bool TRTBatchedNMS::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 3 || pos == 4) { - return ioDesc[pos].type == nvinfer1::DataType::kINT32 && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; - } - return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; -} - -const char* TRTBatchedNMS::getPluginType() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; } - -const char* TRTBatchedNMS::getPluginVersion() const TRT_NOEXCEPT { return NMS_PLUGIN_VERSION; } - -IPluginV2DynamicExt* TRTBatchedNMS::clone() const TRT_NOEXCEPT { - auto* plugin = new TRTBatchedNMS(mLayerName, param, mReturnIndex); - plugin->setPluginNamespace(mNamespace.c_str()); - plugin->setClipParam(mClipBoxes); - return plugin; -} - -nvinfer1::DataType TRTBatchedNMS::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, - int nbInputs) const TRT_NOEXCEPT { - ASSERT(index >= 0 && index < this->getNbOutputs()); - if (index == 1 || index == 2) { - return nvinfer1::DataType::kINT32; - } - return inputTypes[0]; -} - -void TRTBatchedNMS::setClipParam(bool clip) { mClipBoxes = clip; } - -TRTBatchedNMSCreator::TRTBatchedNMSCreator() { - mPluginAttributes.emplace_back( - PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, 
PluginFieldType::kINT32, 1));
-  mPluginAttributes.emplace_back(PluginField("topk", nullptr, PluginFieldType::kINT32, 1));
-  mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1));
-  mPluginAttributes.emplace_back(
-      PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
-  mPluginAttributes.emplace_back(
-      PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
-  mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1));
-  mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1));
-  mPluginAttributes.emplace_back(PluginField("return_index", nullptr, PluginFieldType::kINT32, 1));
-
-  mFC.nbFields = mPluginAttributes.size();
-  mFC.fields = mPluginAttributes.data();
-}
-
-const char* TRTBatchedNMSCreator::getPluginName() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; }
-
-const char* TRTBatchedNMSCreator::getPluginVersion() const TRT_NOEXCEPT {
-  return NMS_PLUGIN_VERSION;
-}
-
-IPluginV2Ext* TRTBatchedNMSCreator::createPlugin(const char* name,
-                                                 const PluginFieldCollection* fc) TRT_NOEXCEPT {
-  const PluginField* fields = fc->fields;
-  bool clipBoxes = true;
-  bool returnIndex = false;
-  nvinfer1::plugin::NMSParameters params{};
-
-  for (int i = 0; i < fc->nbFields; ++i) {
-    const char* attrName = fields[i].name;
-    if (!strcmp(attrName, "background_label_id")) {
-      ASSERT(fields[i].type == PluginFieldType::kINT32);
-      params.backgroundLabelId = *(static_cast<const int*>(fields[i].data));
-    } else if (!strcmp(attrName, "num_classes")) {
-      ASSERT(fields[i].type == PluginFieldType::kINT32);
-      params.numClasses = *(static_cast<const int*>(fields[i].data));
-    } else if (!strcmp(attrName, "topk")) {
-      ASSERT(fields[i].type == PluginFieldType::kINT32);
-      params.topK = *(static_cast<const int*>(fields[i].data));
-    } else if (!strcmp(attrName, "keep_topk")) {
-      ASSERT(fields[i].type == PluginFieldType::kINT32);
-      params.keepTopK = *(static_cast<const int*>(fields[i].data));
-    } else if (!strcmp(attrName, "score_threshold")) {
-      ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
-      params.scoreThreshold = *(static_cast<const float*>(fields[i].data));
-    } else if (!strcmp(attrName, "iou_threshold")) {
-      ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
-      params.iouThreshold = *(static_cast<const float*>(fields[i].data));
-    } else if (!strcmp(attrName, "is_normalized")) {
-      params.isNormalized = *(static_cast<const bool*>(fields[i].data));
-    } else if (!strcmp(attrName, "clip_boxes")) {
-      clipBoxes = *(static_cast<const bool*>(fields[i].data));
-    } else if (!strcmp(attrName, "return_index")) {
-      returnIndex = *(static_cast<const bool*>(fields[i].data));
-    }
-  }
-
-  TRTBatchedNMS* plugin = new TRTBatchedNMS(name, params, returnIndex);
-  plugin->setClipParam(clipBoxes);
-  plugin->setPluginNamespace(mNamespace.c_str());
-  return plugin;
-}
-
-IPluginV2Ext* TRTBatchedNMSCreator::deserializePlugin(const char* name, const void* serialData,
-                                                      size_t serialLength) TRT_NOEXCEPT {
-  // This object will be deleted when the network is destroyed, which will
-  // call NMS::destroy()
-  TRTBatchedNMS* plugin = new TRTBatchedNMS(name, serialData, serialLength);
-  plugin->setPluginNamespace(mNamespace.c_str());
-  return plugin;
-}
-
-REGISTER_TENSORRT_PLUGIN(TRTBatchedNMSCreator);
+namespace mmdeploy
+{
+    using namespace nvinfer1;
+    using nvinfer1::plugin::NMSParameters;
+
+    namespace
+    {
+        static const char* NMS_PLUGIN_VERSION{"1"};
+        static const char* NMS_PLUGIN_NAME{"TRTBatchedNMS"};
+    }  // namespace
+
+    TRTBatchedNMS::TRTBatchedNMS(const std::string& name, NMSParameters params, bool returnIndex)
+        : TRTPluginBase(name)
+        , param(params)
+        , mReturnIndex(returnIndex)
+    {
+    }
+
+    TRTBatchedNMS::TRTBatchedNMS(const std::string& name, const void* data, size_t length)
+        : TRTPluginBase(name)
+    {
+        deserialize_value(&data, &length, &param);
+        deserialize_value(&data, &length, &mClipBoxes);
+        deserialize_value(&data, &length, &mReturnIndex);
+    }
+
+    int TRTBatchedNMS::getNbOutputs() const TRT_NOEXCEPT
+    {
+        int num = mReturnIndex ? 3 : 2;
+        return num;
+    }
+
+    nvinfer1::DimsExprs TRTBatchedNMS::getOutputDimensions(
+        int outputIndex,
+        const nvinfer1::DimsExprs* inputs,
+        int nbInputs,
+        nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT
+    {
+        ASSERT(nbInputs == 2);
+        ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs());
+        ASSERT(inputs[0].nbDims == 4);
+        ASSERT(inputs[1].nbDims == 3);
+
+        nvinfer1::DimsExprs ret;
+        ret.d[0] = inputs[0].d[0];
+        ret.d[1] = exprBuilder.constant(param.keepTopK);
+        switch (outputIndex)
+        {
+            case 0:
+                ret.nbDims = 3;
+                ret.d[2] = exprBuilder.constant(5);
+                break;
+            case 1:
+                ret.nbDims = 2;
+                break;
+            case 2:
+                ret.nbDims = 2;
+            default:
+                break;
+        }
+
+        return ret;
+    }
+
+    size_t TRTBatchedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT
+    {
+        size_t batch_size = inputs[0].dims.d[0];
+        size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3];
+        size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2];
+        size_t num_priors = inputs[0].dims.d[1];
+        bool shareLocation = (inputs[0].dims.d[2] == 1);
+        int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? param.topK : inputs[1].dims.d[1];
+        return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size, param.numClasses, num_priors, topk, DataType::kFLOAT, DataType::kFLOAT);
+    }
+
+    int TRTBatchedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                               const nvinfer1::PluginTensorDesc* outputDesc,
+                               const void* const* inputs,
+                               void* const* outputs,
+                               void* workSpace,
+                               cudaStream_t stream) TRT_NOEXCEPT
+    {
+        const void* const locData = inputs[0];
+        const void* const confData = inputs[1];
+
+        void* nmsedDets = outputs[0];
+        void* nmsedLabels = outputs[1];
+        void* nmsedIndex = mReturnIndex ? outputs[2] : nullptr;
+
+        size_t batch_size = inputDesc[0].dims.d[0];
+        size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3];
+        size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2];
+        size_t num_priors = inputDesc[0].dims.d[1];
+        bool shareLocation = (inputDesc[0].dims.d[2] == 1);
+
+        int topk =
+            param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? param.topK : inputDesc[1].dims.d[1];
+        bool rotated = false;
+        pluginStatus_t status = nmsInference(
+            stream,
+            batch_size,
+            boxes_size,
+            score_size,
+            shareLocation,
+            param.backgroundLabelId,
+            num_priors,
+            param.numClasses,
+            topk,
+            param.keepTopK,
+            param.scoreThreshold,
+            param.iouThreshold,
+            DataType::kFLOAT,
+            locData,
+            DataType::kFLOAT,
+            confData,
+            nmsedDets,
+            nmsedLabels,
+            nmsedIndex,
+            workSpace,
+            param.isNormalized,
+            false,
+            mClipBoxes,
+            rotated);
+        ASSERT(status == STATUS_SUCCESS);
+
+        return 0;
+    }
+
+    size_t TRTBatchedNMS::getSerializationSize() const TRT_NOEXCEPT
+    {
+        // NMSParameters
+        return sizeof(NMSParameters) + sizeof(mClipBoxes) + sizeof(mReturnIndex);
+    }
+
+    void TRTBatchedNMS::serialize(void* buffer) const TRT_NOEXCEPT
+    {
+        serialize_value(&buffer, param);
+        serialize_value(&buffer, mClipBoxes);
+        serialize_value(&buffer, mReturnIndex);
+    }
+
+    void TRTBatchedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) TRT_NOEXCEPT
+    {
+        // Validate input arguments
+    }
+
+    bool TRTBatchedNMS::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT
+    {
+        if (pos == 3 || pos == 4)
+        {
+            return ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
+                   ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
+        }
+        return ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
+               ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
+    }
+
+    const char* TRTBatchedNMS::getPluginType() const TRT_NOEXCEPT
+    {
+        return NMS_PLUGIN_NAME;
+    }
+
+    const char* TRTBatchedNMS::getPluginVersion() const TRT_NOEXCEPT
+    {
+        return NMS_PLUGIN_VERSION;
+    }
+
+    IPluginV2DynamicExt* TRTBatchedNMS::clone() const TRT_NOEXCEPT
+    {
+        auto* plugin = new TRTBatchedNMS(mLayerName, param, mReturnIndex);
+        plugin->setPluginNamespace(mNamespace.c_str());
+        plugin->setClipParam(mClipBoxes);
+        return plugin;
+    }
+
+    nvinfer1::DataType TRTBatchedNMS::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT
+    {
+        ASSERT(index >= 0 && index < this->getNbOutputs());
+        if (index == 1 || index == 2)
+        {
+            return nvinfer1::DataType::kINT32;
+        }
+        return inputTypes[0];
+    }
+
+    void TRTBatchedNMS::setClipParam(bool clip)
+    {
+        mClipBoxes = clip;
+    }
+
+    TRTBatchedNMSCreator::TRTBatchedNMSCreator()
+    {
+        mPluginAttributes.emplace_back(
+            PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1));
+        mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1));
+        mPluginAttributes.emplace_back(PluginField("topk", nullptr, PluginFieldType::kINT32, 1));
+        mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1));
+        mPluginAttributes.emplace_back(
+            PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
+        mPluginAttributes.emplace_back(
+            PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
+        mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1));
+        mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1));
+        mPluginAttributes.emplace_back(PluginField("return_index", nullptr, PluginFieldType::kINT32, 1));
+
+        mFC.nbFields = mPluginAttributes.size();
+        mFC.fields = mPluginAttributes.data();
+    }
+
+    const char* TRTBatchedNMSCreator::getPluginName() const TRT_NOEXCEPT
+    {
+        return NMS_PLUGIN_NAME;
+    }
+
+    const char* TRTBatchedNMSCreator::getPluginVersion() const TRT_NOEXCEPT
+    {
+        return NMS_PLUGIN_VERSION;
+    }
+
+    IPluginV2Ext* TRTBatchedNMSCreator::createPlugin(const char* name,
+                                                     const PluginFieldCollection* fc) TRT_NOEXCEPT
+    {
+        const PluginField* fields = fc->fields;
+        bool clipBoxes = true;
+        bool returnIndex = false;
+        nvinfer1::plugin::NMSParameters params{};
+
+        for (int i = 0; i < fc->nbFields; ++i)
+        {
+            const char* attrName = fields[i].name;
+            if (!strcmp(attrName, "background_label_id"))
+            {
+                ASSERT(fields[i].type == PluginFieldType::kINT32);
+                params.backgroundLabelId = *(static_cast<const int*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "num_classes"))
+            {
+                ASSERT(fields[i].type == PluginFieldType::kINT32);
+                params.numClasses = *(static_cast<const int*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "topk"))
+            {
+                ASSERT(fields[i].type == PluginFieldType::kINT32);
+                params.topK = *(static_cast<const int*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "keep_topk"))
+            {
+                ASSERT(fields[i].type == PluginFieldType::kINT32);
+                params.keepTopK = *(static_cast<const int*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "score_threshold"))
+            {
+                ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
+                params.scoreThreshold = *(static_cast<const float*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "iou_threshold"))
+            {
+                ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
+                params.iouThreshold = *(static_cast<const float*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "is_normalized"))
+            {
+                params.isNormalized = *(static_cast<const bool*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "clip_boxes"))
+            {
+                clipBoxes = *(static_cast<const bool*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "return_index"))
+            {
+                returnIndex = *(static_cast<const bool*>(fields[i].data));
+            }
+        }
+
+        TRTBatchedNMS* plugin = new TRTBatchedNMS(name, params, returnIndex);
+        plugin->setClipParam(clipBoxes);
+        plugin->setPluginNamespace(mNamespace.c_str());
+        return plugin;
+    }
+
+    IPluginV2Ext* TRTBatchedNMSCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT
+    {
+        // This object will be deleted when the network is destroyed, which will
+        // call NMS::destroy()
+        TRTBatchedNMS* plugin = new TRTBatchedNMS(name, serialData, serialLength);
+        plugin->setPluginNamespace(mNamespace.c_str());
+        return plugin;
+    }
+
+    REGISTER_TENSORRT_PLUGIN(TRTBatchedNMSCreator);
 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp b/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp
index d1e5d643db..2cd276a931 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp
@@ -8,75 +8,77 @@
 #include "NvInferPluginUtils.h"
 #include "trt_plugin_base.hpp"
-namespace mmdeploy {
+namespace mmdeploy
+{
-enum NMSReturnType { RETURN_DETS = 1, RETURN_INDEX = 1 << 1 };
+    enum NMSReturnType
+    {
+        RETURN_DETS = 1,
+        RETURN_INDEX = 1 << 1
+    };
-class TRTBatchedNMS : public TRTPluginBase {
- public:
-  TRTBatchedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param, bool returnIndex);
+    class TRTBatchedNMS : public TRTPluginBase
+    {
+      public:
+        TRTBatchedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param, bool returnIndex);
-  TRTBatchedNMS(const std::string& name, const void* data, size_t length);
+        TRTBatchedNMS(const std::string& name, const void* data, size_t length);
-  ~TRTBatchedNMS() TRT_NOEXCEPT override = default;
+        ~TRTBatchedNMS() TRT_NOEXCEPT override = default;
-  int getNbOutputs() const TRT_NOEXCEPT override;
+        int getNbOutputs() const TRT_NOEXCEPT override;
-  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
-                                          int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
-      TRT_NOEXCEPT override;
+        nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
+            TRT_NOEXCEPT override;
-  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
-                          const nvinfer1::PluginTensorDesc* outputs,
-                          int nbOutputs) const TRT_NOEXCEPT override;
+        size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override;
-  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
-              const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
-              void* const* outputs, void* workSpace, cudaStream_t stream) TRT_NOEXCEPT override;
+        int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                    const nvinfer1::PluginTensorDesc* outputDesc,
+                    const void* const* inputs,
+                    void* const* outputs,
+                    void* workSpace,
+                    cudaStream_t stream) TRT_NOEXCEPT override;
-  size_t getSerializationSize() const TRT_NOEXCEPT override;
+        size_t getSerializationSize() const TRT_NOEXCEPT override;
-  void serialize(void* buffer) const TRT_NOEXCEPT override;
+        void serialize(void* buffer) const TRT_NOEXCEPT override;
-  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
-                       const nvinfer1::DynamicPluginTensorDesc* outputs,
-                       int nbOutputs) TRT_NOEXCEPT override;
+        void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) TRT_NOEXCEPT override;
-  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs,
-                                 int nbOutputs) TRT_NOEXCEPT override;
+        bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override;
-  const char* getPluginType() const TRT_NOEXCEPT override;
+        const char* getPluginType() const TRT_NOEXCEPT override;
-  const char* getPluginVersion() const TRT_NOEXCEPT override;
+        const char* getPluginVersion() const TRT_NOEXCEPT override;
-  nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
+        nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
-  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType,
-                                       int nbInputs) const TRT_NOEXCEPT override;
+        nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType, int nbInputs) const TRT_NOEXCEPT override;
-  void setClipParam(bool clip);
+        void setClipParam(bool clip);
- private:
-  nvinfer1::plugin::NMSParameters param{};
-  bool mClipBoxes{};
-  bool mReturnIndex{};
-};
+      private:
+        nvinfer1::plugin::NMSParameters param{};
+        bool mClipBoxes{};
+        bool mReturnIndex{};
+    };
-class TRTBatchedNMSCreator : public TRTPluginCreatorBase {
- public:
-  TRTBatchedNMSCreator();
+    class TRTBatchedNMSCreator : public TRTPluginCreatorBase
+    {
+      public:
+        TRTBatchedNMSCreator();
-  ~TRTBatchedNMSCreator() TRT_NOEXCEPT override = default;
+        ~TRTBatchedNMSCreator() TRT_NOEXCEPT override = default;
-  const char* getPluginName() const TRT_NOEXCEPT override;
+        const char* getPluginName() const TRT_NOEXCEPT override;
-  const char* getPluginVersion() const TRT_NOEXCEPT override;
+        const char* getPluginVersion() const TRT_NOEXCEPT override;
-  nvinfer1::IPluginV2Ext* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc)
-      TRT_NOEXCEPT override;
+        nvinfer1::IPluginV2Ext* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc)
+            TRT_NOEXCEPT override;
-  nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData,
-                                            size_t serialLength) TRT_NOEXCEPT override;
-};
+        nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;
+    };
 }  // namespace mmdeploy
 #endif  // TRT_BATCHED_NMS_PLUGIN_CUSTOM_H
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.cpp b/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.cpp
index 9d977bc937..80b5be6abc 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.cpp
@@ -8,222 +8,295 @@
 #include "nms/kernel.h"
 #include "trt_serialize.hpp"
-namespace mmdeploy {
-using namespace nvinfer1;
-using nvinfer1::plugin::NMSParameters;
-
-namespace {
-static const char* NMS_PLUGIN_VERSION{"1"};
-static const char* NMS_PLUGIN_NAME{"TRTBatchedRotatedNMS"};
-}  // namespace
-
-TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, NMSParameters params)
-    : TRTPluginBase(name), param(params) {}
-
-TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length)
-    : TRTPluginBase(name) {
-  deserialize_value(&data, &length, &param);
-  deserialize_value(&data, &length, &mClipBoxes);
-}
-
-int TRTBatchedRotatedNMS::getNbOutputs() const TRT_NOEXCEPT { return 2; }
-
-nvinfer1::DimsExprs TRTBatchedRotatedNMS::getOutputDimensions(
-    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
-    nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
-  ASSERT(nbInputs == 2);
-  ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs());
-  ASSERT(inputs[0].nbDims == 4);
-  ASSERT(inputs[1].nbDims == 3);
-
-  nvinfer1::DimsExprs ret;
-  ret.d[0] = inputs[0].d[0];
-  ret.d[1] = exprBuilder.constant(param.keepTopK);
-  switch (outputIndex) {
-    case 0:
-      ret.nbDims = 3;
-      ret.d[2] = exprBuilder.constant(6);
-      break;
-    case 1:
-      ret.nbDims = 2;
-      break;
-    default:
-      break;
-  }
-
-  return ret;
-}
-
-size_t TRTBatchedRotatedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
-                                              int nbInputs,
-                                              const nvinfer1::PluginTensorDesc* outputs,
-                                              int nbOutputs) const TRT_NOEXCEPT {
-  size_t batch_size = inputs[0].dims.d[0];
-  size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3];
-  size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2];
-  size_t num_priors = inputs[0].dims.d[1];
-  bool shareLocation = (inputs[0].dims.d[2] == 1);
-  int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? param.topK : inputs[1].dims.d[1];
-  return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size,
-                                         param.numClasses, num_priors, topk, DataType::kFLOAT,
-                                         DataType::kFLOAT);
-}
-
-int TRTBatchedRotatedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
-                                  const nvinfer1::PluginTensorDesc* outputDesc,
-                                  const void* const* inputs, void* const* outputs, void* workSpace,
-                                  cudaStream_t stream) TRT_NOEXCEPT {
-  const void* const locData = inputs[0];
-  const void* const confData = inputs[1];
-
-  void* nmsedDets = outputs[0];
-  void* nmsedLabels = outputs[1];
-
-  size_t batch_size = inputDesc[0].dims.d[0];
-  size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3];
-  size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2];
-  size_t num_priors = inputDesc[0].dims.d[1];
-  bool shareLocation = (inputDesc[0].dims.d[2] == 1);
-
-  int topk =
-      param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? param.topK : inputDesc[1].dims.d[1];
-  bool rotated = true;
-  pluginStatus_t status = nmsInference(
-      stream, batch_size, boxes_size, score_size, shareLocation, param.backgroundLabelId,
-      num_priors, param.numClasses, topk, param.keepTopK, param.scoreThreshold, param.iouThreshold,
-      DataType::kFLOAT, locData, DataType::kFLOAT, confData, nmsedDets, nmsedLabels, nullptr,
-      workSpace, param.isNormalized, false, mClipBoxes, rotated);
-  ASSERT(status == STATUS_SUCCESS);
-
-  return 0;
-}
-
-size_t TRTBatchedRotatedNMS::getSerializationSize() const TRT_NOEXCEPT {
-  // NMSParameters,
-  return sizeof(NMSParameters) + sizeof(bool);
-}
-
-void TRTBatchedRotatedNMS::serialize(void* buffer) const TRT_NOEXCEPT {
-  serialize_value(&buffer, param);
-  serialize_value(&buffer, mClipBoxes);
-}
-
-void TRTBatchedRotatedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
-                                           int nbInputs,
-                                           const nvinfer1::DynamicPluginTensorDesc* outputs,
-                                           int nbOutputs) TRT_NOEXCEPT {
-  // Validate input arguments
-}
-
-bool TRTBatchedRotatedNMS::supportsFormatCombination(int pos,
-                                                     const nvinfer1::PluginTensorDesc* ioDesc,
-                                                     int nbInputs, int nbOutputs) TRT_NOEXCEPT {
-  if (pos == 3) {
-    return ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
-           ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
-  }
-  return ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
-         ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
-}
-
-const char* TRTBatchedRotatedNMS::getPluginType() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; }
-
-const char* TRTBatchedRotatedNMS::getPluginVersion() const TRT_NOEXCEPT {
-  return NMS_PLUGIN_VERSION;
-}
-
-IPluginV2DynamicExt* TRTBatchedRotatedNMS::clone() const TRT_NOEXCEPT {
-  auto* plugin = new TRTBatchedRotatedNMS(mLayerName, param);
-  plugin->setPluginNamespace(mNamespace.c_str());
-  plugin->setClipParam(mClipBoxes);
-  return plugin;
-}
-
-nvinfer1::DataType TRTBatchedRotatedNMS::getOutputDataType(int index,
-                                                           const nvinfer1::DataType* inputTypes,
-                                                           int nbInputs) const TRT_NOEXCEPT {
-  ASSERT(index >= 0 && index < this->getNbOutputs());
-  if (index == 1) {
-    return nvinfer1::DataType::kINT32;
-  }
-  return inputTypes[0];
-}
-
-void TRTBatchedRotatedNMS::setClipParam(bool clip) { mClipBoxes = clip; }
-
-TRTBatchedRotatedNMSCreator::TRTBatchedRotatedNMSCreator() {
-  mPluginAttributes.emplace_back(
-      PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1));
-  mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1));
-  mPluginAttributes.emplace_back(PluginField("topk", nullptr, PluginFieldType::kINT32, 1));
-  mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1));
-  mPluginAttributes.emplace_back(
-      PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
-  mPluginAttributes.emplace_back(
-      PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
-  mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1));
-  mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1));
-
-  mFC.nbFields = mPluginAttributes.size();
-  mFC.fields = mPluginAttributes.data();
-}
-
-const char* TRTBatchedRotatedNMSCreator::getPluginName() const TRT_NOEXCEPT {
-  return NMS_PLUGIN_NAME;
-}
-
-const char* TRTBatchedRotatedNMSCreator::getPluginVersion() const TRT_NOEXCEPT {
-  return NMS_PLUGIN_VERSION;
-}
-
-IPluginV2Ext* TRTBatchedRotatedNMSCreator::createPlugin(
-    const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
-  const PluginField* fields = fc->fields;
-  bool clipBoxes = true;
-  nvinfer1::plugin::NMSParameters params{};
-
-  for (int i = 0; i < fc->nbFields; ++i) {
-    const char* attrName = fields[i].name;
-    if (!strcmp(attrName, "background_label_id")) {
-      ASSERT(fields[i].type == PluginFieldType::kINT32);
-      params.backgroundLabelId = *(static_cast<const int*>(fields[i].data));
-    } else if (!strcmp(attrName, "num_classes")) {
-      ASSERT(fields[i].type == PluginFieldType::kINT32);
-      params.numClasses = *(static_cast<const int*>(fields[i].data));
-    } else if (!strcmp(attrName, "topk")) {
-      ASSERT(fields[i].type == PluginFieldType::kINT32);
-      params.topK = *(static_cast<const int*>(fields[i].data));
-    } else if (!strcmp(attrName, "keep_topk")) {
-      ASSERT(fields[i].type == PluginFieldType::kINT32);
-      params.keepTopK = *(static_cast<const int*>(fields[i].data));
-    } else if (!strcmp(attrName, "score_threshold")) {
-      ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
-      params.scoreThreshold = *(static_cast<const float*>(fields[i].data));
-    } else if (!strcmp(attrName, "iou_threshold")) {
-      ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
-      params.iouThreshold = *(static_cast<const float*>(fields[i].data));
-    } else if (!strcmp(attrName, "is_normalized")) {
-      params.isNormalized = *(static_cast<const bool*>(fields[i].data));
-    } else if (!strcmp(attrName, "clip_boxes")) {
-      clipBoxes = *(static_cast<const bool*>(fields[i].data));
-    }
-  }
-
-  TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, params);
-  plugin->setClipParam(clipBoxes);
-  plugin->setPluginNamespace(mNamespace.c_str());
-  return plugin;
-}
-
-IPluginV2Ext* TRTBatchedRotatedNMSCreator::deserializePlugin(const char* name,
-                                                             const void* serialData,
-                                                             size_t serialLength) TRT_NOEXCEPT {
-  // This object will be deleted when the network is destroyed, which will
-  // call NMS::destroy()
-  TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, serialData, serialLength);
-  plugin->setPluginNamespace(mNamespace.c_str());
-  return plugin;
-}
-
-REGISTER_TENSORRT_PLUGIN(TRTBatchedRotatedNMSCreator);
+namespace mmdeploy
+{
+    using namespace nvinfer1;
+    using nvinfer1::plugin::NMSParameters;
+
+    namespace
+    {
+        static const char* NMS_PLUGIN_VERSION{"1"};
+        static const char* NMS_PLUGIN_NAME{"TRTBatchedRotatedNMS"};
+    }  // namespace
+
+    TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, NMSParameters params)
+        : TRTPluginBase(name)
+        , param(params)
+    {
+    }
+
+    TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length)
+        : TRTPluginBase(name)
+    {
+        deserialize_value(&data, &length, &param);
+        deserialize_value(&data, &length, &mClipBoxes);
+    }
+
+    int TRTBatchedRotatedNMS::getNbOutputs() const TRT_NOEXCEPT
+    {
+        return 2;
+    }
+
+    nvinfer1::DimsExprs TRTBatchedRotatedNMS::getOutputDimensions(
+        int outputIndex,
+        const nvinfer1::DimsExprs* inputs,
+        int nbInputs,
+        nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT
+    {
+        ASSERT(nbInputs == 2);
+        ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs());
+        ASSERT(inputs[0].nbDims == 4);
+        ASSERT(inputs[1].nbDims == 3);
+
+        nvinfer1::DimsExprs ret;
+        ret.d[0] = inputs[0].d[0];
+        ret.d[1] = exprBuilder.constant(param.keepTopK);
+        switch (outputIndex)
+        {
+            case 0:
+                ret.nbDims = 3;
+                ret.d[2] = exprBuilder.constant(6);
+                break;
+            case 1:
+                ret.nbDims = 2;
+                break;
+            default:
+                break;
+        }
+
+        return ret;
+    }
+
+    size_t TRTBatchedRotatedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+                                                  int nbInputs,
+                                                  const nvinfer1::PluginTensorDesc* outputs,
+                                                  int nbOutputs) const TRT_NOEXCEPT
+    {
+        size_t batch_size = inputs[0].dims.d[0];
+        size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3];
+        size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2];
+        size_t num_priors = inputs[0].dims.d[1];
+        bool shareLocation = (inputs[0].dims.d[2] == 1);
+        int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? param.topK : inputs[1].dims.d[1];
+        return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size, param.numClasses, num_priors, topk, DataType::kFLOAT, DataType::kFLOAT);
+    }
+
+    int TRTBatchedRotatedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                                      const nvinfer1::PluginTensorDesc* outputDesc,
+                                      const void* const* inputs,
+                                      void* const* outputs,
+                                      void* workSpace,
+                                      cudaStream_t stream) TRT_NOEXCEPT
+    {
+        const void* const locData = inputs[0];
+        const void* const confData = inputs[1];
+
+        void* nmsedDets = outputs[0];
+        void* nmsedLabels = outputs[1];
+
+        size_t batch_size = inputDesc[0].dims.d[0];
+        size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3];
+        size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2];
+        size_t num_priors = inputDesc[0].dims.d[1];
+        bool shareLocation = (inputDesc[0].dims.d[2] == 1);
+
+        int topk =
+            param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? param.topK : inputDesc[1].dims.d[1];
+        bool rotated = true;
+        pluginStatus_t status = nmsInference(
+            stream,
+            batch_size,
+            boxes_size,
+            score_size,
+            shareLocation,
+            param.backgroundLabelId,
+            num_priors,
+            param.numClasses,
+            topk,
+            param.keepTopK,
+            param.scoreThreshold,
+            param.iouThreshold,
+            DataType::kFLOAT,
+            locData,
+            DataType::kFLOAT,
+            confData,
+            nmsedDets,
+            nmsedLabels,
+            nullptr,
+            workSpace,
+            param.isNormalized,
+            false,
+            mClipBoxes,
+            rotated);
+        ASSERT(status == STATUS_SUCCESS);
+
+        return 0;
+    }
+
+    size_t TRTBatchedRotatedNMS::getSerializationSize() const TRT_NOEXCEPT
+    {
+        // NMSParameters,
+        return sizeof(NMSParameters) + sizeof(bool);
+    }
+
+    void TRTBatchedRotatedNMS::serialize(void* buffer) const TRT_NOEXCEPT
+    {
+        serialize_value(&buffer, param);
+        serialize_value(&buffer, mClipBoxes);
+    }
+
+    void TRTBatchedRotatedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
+                                               int nbInputs,
+                                               const nvinfer1::DynamicPluginTensorDesc* outputs,
+                                               int nbOutputs) TRT_NOEXCEPT
+    {
+        // Validate input arguments
+    }
+
+    bool TRTBatchedRotatedNMS::supportsFormatCombination(int pos,
+                                                         const nvinfer1::PluginTensorDesc* ioDesc,
+                                                         int nbInputs,
+                                                         int nbOutputs) TRT_NOEXCEPT
+    {
+        if (pos == 3)
+        {
+            return ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
+                   ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
+        }
+        return ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
+               ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
+    }
+
+    const char* TRTBatchedRotatedNMS::getPluginType() const TRT_NOEXCEPT
+    {
+        return NMS_PLUGIN_NAME;
+    }
+
+    const char* TRTBatchedRotatedNMS::getPluginVersion() const TRT_NOEXCEPT
+    {
+        return NMS_PLUGIN_VERSION;
+    }
+
+    IPluginV2DynamicExt* TRTBatchedRotatedNMS::clone() const TRT_NOEXCEPT
+    {
+        auto* plugin = new TRTBatchedRotatedNMS(mLayerName, param);
+        plugin->setPluginNamespace(mNamespace.c_str());
+        plugin->setClipParam(mClipBoxes);
+        return plugin;
+    }
+
+    nvinfer1::DataType TRTBatchedRotatedNMS::getOutputDataType(int index,
+                                                               const nvinfer1::DataType* inputTypes,
+                                                               int nbInputs) const TRT_NOEXCEPT
+    {
+        ASSERT(index >= 0 && index < this->getNbOutputs());
+        if (index == 1)
+        {
+            return nvinfer1::DataType::kINT32;
+        }
+        return inputTypes[0];
+    }
+
+    void TRTBatchedRotatedNMS::setClipParam(bool clip)
+    {
+        mClipBoxes = clip;
+    }
+
+    TRTBatchedRotatedNMSCreator::TRTBatchedRotatedNMSCreator()
+    {
+        mPluginAttributes.emplace_back(
+            PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1));
+        mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1));
+        mPluginAttributes.emplace_back(PluginField("topk", nullptr, PluginFieldType::kINT32, 1));
+        mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1));
+        mPluginAttributes.emplace_back(
+            PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
+        mPluginAttributes.emplace_back(
+            PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
+        mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1));
+        mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1));
+
+        mFC.nbFields = mPluginAttributes.size();
+        mFC.fields = mPluginAttributes.data();
+    }
+
+    const char* TRTBatchedRotatedNMSCreator::getPluginName() const TRT_NOEXCEPT
+    {
+        return NMS_PLUGIN_NAME;
+    }
+
+    const char* TRTBatchedRotatedNMSCreator::getPluginVersion() const TRT_NOEXCEPT
+    {
+        return NMS_PLUGIN_VERSION;
+    }
+
+    IPluginV2Ext* TRTBatchedRotatedNMSCreator::createPlugin(
+        const char* name,
+        const PluginFieldCollection* fc) TRT_NOEXCEPT
+    {
+        const PluginField* fields = fc->fields;
+        bool clipBoxes = true;
+        nvinfer1::plugin::NMSParameters params{};
+
+        for (int i = 0; i < fc->nbFields; ++i)
+        {
+            const char* attrName = fields[i].name;
+            if (!strcmp(attrName, "background_label_id"))
+            {
+                ASSERT(fields[i].type == PluginFieldType::kINT32);
+                params.backgroundLabelId = *(static_cast<const int*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "num_classes"))
+            {
+                ASSERT(fields[i].type == PluginFieldType::kINT32);
+                params.numClasses = *(static_cast<const int*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "topk"))
+            {
+                ASSERT(fields[i].type == PluginFieldType::kINT32);
+                params.topK = *(static_cast<const int*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "keep_topk"))
+            {
+                ASSERT(fields[i].type == PluginFieldType::kINT32);
+                params.keepTopK = *(static_cast<const int*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "score_threshold"))
+            {
+                ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
+                params.scoreThreshold = *(static_cast<const float*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "iou_threshold"))
+            {
+                ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
+                params.iouThreshold = *(static_cast<const float*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "is_normalized"))
+            {
+                params.isNormalized = *(static_cast<const bool*>(fields[i].data));
+            }
+            else if (!strcmp(attrName, "clip_boxes"))
+            {
+                clipBoxes = *(static_cast<const bool*>(fields[i].data));
+            }
+        }
+
+        TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, params);
+        plugin->setClipParam(clipBoxes);
+        plugin->setPluginNamespace(mNamespace.c_str());
+        return plugin;
+    }
+
+    IPluginV2Ext* TRTBatchedRotatedNMSCreator::deserializePlugin(const char* name,
+                                                                 const void* serialData,
+                                                                 size_t serialLength) TRT_NOEXCEPT
+    {
+        // This object will be deleted when the network is destroyed, which will
+        // call NMS::destroy()
+        TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, serialData, serialLength);
+        plugin->setPluginNamespace(mNamespace.c_str());
+        return plugin;
+    }
+
+    REGISTER_TENSORRT_PLUGIN(TRTBatchedRotatedNMSCreator);
 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.hpp b/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.hpp
index 66479eb7e7..49b5cb650d 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.hpp
@@ -7,72 +7,70 @@
 #include "NvInferPluginUtils.h"
 #include "trt_plugin_base.hpp"
-namespace mmdeploy {
-class TRTBatchedRotatedNMS : public TRTPluginBase {
- public:
-  TRTBatchedRotatedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param);
+namespace mmdeploy
+{
+    class TRTBatchedRotatedNMS : public TRTPluginBase
+    {
+      public:
+        TRTBatchedRotatedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param);
-  TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length);
+        TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length);
-  ~TRTBatchedRotatedNMS() TRT_NOEXCEPT override = default;
+        ~TRTBatchedRotatedNMS() TRT_NOEXCEPT override = default;
-  int getNbOutputs() const TRT_NOEXCEPT override;
+        int getNbOutputs() const TRT_NOEXCEPT override;
-  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
-                                          int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
-      TRT_NOEXCEPT override;
+        nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
+            TRT_NOEXCEPT override;
-  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
-                          const nvinfer1::PluginTensorDesc* outputs,
-                          int nbOutputs) const TRT_NOEXCEPT override;
+        size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override;
-  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
-              const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
-              void* const* outputs, void* workSpace, cudaStream_t stream) TRT_NOEXCEPT override;
+        int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                    const nvinfer1::PluginTensorDesc* outputDesc,
+                    const void* const* inputs,
+                    void* const* outputs,
+                    void* workSpace,
+                    cudaStream_t stream) TRT_NOEXCEPT override;
-  size_t getSerializationSize() const TRT_NOEXCEPT override;
+        size_t getSerializationSize() const TRT_NOEXCEPT override;
-  void serialize(void* buffer) const TRT_NOEXCEPT override;
+        void serialize(void* buffer) const TRT_NOEXCEPT override;
-  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
-                       const nvinfer1::DynamicPluginTensorDesc* outputs,
-                       int nbOutputs) TRT_NOEXCEPT override;
+        void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) TRT_NOEXCEPT override;
-  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs,
-                                 int nbOutputs) TRT_NOEXCEPT override;
+        bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override;
-  const char* getPluginType() const TRT_NOEXCEPT override;
+        const char* getPluginType() const TRT_NOEXCEPT override;
-  const char* getPluginVersion() const TRT_NOEXCEPT override;
+        const char* getPluginVersion() const TRT_NOEXCEPT override;
-  nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
+        nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
-  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType,
-                                       int nbInputs) const TRT_NOEXCEPT override;
+        nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType, int nbInputs) const TRT_NOEXCEPT override;
-  void setClipParam(bool clip);
+        void setClipParam(bool clip);
- private:
-  nvinfer1::plugin::NMSParameters param{};
-  bool mClipBoxes{};
-};
+      private:
+        nvinfer1::plugin::NMSParameters param{};
+        bool mClipBoxes{};
+    };
-class TRTBatchedRotatedNMSCreator : public TRTPluginCreatorBase {
- public:
-  TRTBatchedRotatedNMSCreator();
+    class TRTBatchedRotatedNMSCreator : public TRTPluginCreatorBase
+    {
+      public:
+        TRTBatchedRotatedNMSCreator();
-  ~TRTBatchedRotatedNMSCreator() TRT_NOEXCEPT override = default;
+        ~TRTBatchedRotatedNMSCreator() TRT_NOEXCEPT override = default;
-  const char* getPluginName() const TRT_NOEXCEPT override;
+        const char* getPluginName() const TRT_NOEXCEPT override;
-  const char* getPluginVersion() const TRT_NOEXCEPT override;
+        const char* getPluginVersion() const TRT_NOEXCEPT override;
-  nvinfer1::IPluginV2Ext* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc)
-      TRT_NOEXCEPT override;
+        nvinfer1::IPluginV2Ext* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc)
+            TRT_NOEXCEPT override;
-  nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData,
-                                            size_t serialLength) TRT_NOEXCEPT override;
-};
+        nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;
+    };
 }  // namespace mmdeploy
 #endif
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.cpp b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.cpp
index 0f236e4956..db2063d235 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.cpp
@@ -10,176 +10,222 @@
 #include "trt_serialize.hpp"
 using namespace nvinfer1;
-namespace mmdeploy {
-namespace {
-static const char *PLUGIN_VERSION{"1"};
-static const char *PLUGIN_NAME{"TRTBicubicInterpolate"};
-}  // namespace
-
-TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string &name,
-                                             std::vector<float> scale_factor, bool align_corners)
-    : TRTPluginBase(name), mScaleFactor(scale_factor), mAlignCorners(align_corners) {}
-
-TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string name, const void *data,
-                                             size_t length)
-    : TRTPluginBase(name) {
-  deserialize_value(&data, &length, &mScaleFactor);
-  deserialize_value(&data, &length, &mAlignCorners);
-}
-
-nvinfer1::IPluginV2DynamicExt *TRTBicubicInterpolate::clone() const TRT_NOEXCEPT {
-  TRTBicubicInterpolate *plugin =
-      new TRTBicubicInterpolate(mLayerName, mScaleFactor, mAlignCorners);
-  plugin->setPluginNamespace(getPluginNamespace());
-
-  return plugin;
-}
-
-nvinfer1::DimsExprs TRTBicubicInterpolate::getOutputDimensions(
-    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
-    nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
-  nvinfer1::DimsExprs ret;
-  ret.nbDims = 4;
-  ret.d[0] = inputs[0].d[0];
-  ret.d[1] = inputs[0].d[1];
-  auto height = exprBuilder.constant(mScaleFactor[0]);
-  auto width = exprBuilder.constant(mScaleFactor[1]);
-  auto d2 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[2], *height);
-  auto d3 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[3], *width);
-  ret.d[2] = d2;
-  ret.d[3] = d3;
-
-  return ret;
-}
-
-bool TRTBicubicInterpolate::supportsFormatCombination(int pos,
-                                                      const nvinfer1::PluginTensorDesc *ioDesc,
-                                                      int nbInputs, int nbOutputs) TRT_NOEXCEPT {
-  if (pos == 0) {
-    return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
-            ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
-
-  } else {
-    return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
-  }
-}
-
-void TRTBicubicInterpolate::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs,
-                                            int nbInputs,
-                                            const nvinfer1::DynamicPluginTensorDesc *outputs,
-                                            int nbOutputs) TRT_NOEXCEPT {}
-
-size_t TRTBicubicInterpolate::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
-                                               int nbInputs,
-                                               const nvinfer1::PluginTensorDesc *outputs,
-                                               int nbOutputs) const TRT_NOEXCEPT {
-  return 0;
-}
-
-int TRTBicubicInterpolate::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
-                                   const nvinfer1::PluginTensorDesc *outputDesc,
-                                   const void *const *inputs, void *const *outputs, void *workSpace,
-                                   cudaStream_t stream) TRT_NOEXCEPT {
-  int batch = inputDesc[0].dims.d[0];
-  int channels = inputDesc[0].dims.d[1];
-  int height = inputDesc[0].dims.d[2];
-  int width = inputDesc[0].dims.d[3];
-
-  int height_out = outputDesc[0].dims.d[2];
-  int width_out = outputDesc[0].dims.d[3];
-  const void *x = inputs[0];
-  void *output = outputs[0];
-
-  // TODO: add fp16 support
-  auto data_type = inputDesc[0].type;
-  switch (data_type) {
-    case nvinfer1::DataType::kFLOAT:
-      bicubic_interpolate<float>((float *)x, (float *)output, batch, channels, height, width,
-                                 height_out, width_out, mAlignCorners, stream);
-      break;
-    default:
-      return 1;
-      break;
-  }
-
-  return 0;
-}
-
-nvinfer1::DataType TRTBicubicInterpolate::getOutputDataType(int index,
-                                                            const nvinfer1::DataType *inputTypes,
-                                                            int nbInputs) const TRT_NOEXCEPT {
-  return inputTypes[0];
-}
-
-// IPluginV2 Methods
-const char *TRTBicubicInterpolate::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
-
-const char *TRTBicubicInterpolate::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
-
-int TRTBicubicInterpolate::getNbOutputs() const TRT_NOEXCEPT { return 1; }
-
-size_t TRTBicubicInterpolate::getSerializationSize() const TRT_NOEXCEPT {
-  return serialized_size(mScaleFactor) + serialized_size(mAlignCorners);
-}
-
-void TRTBicubicInterpolate::serialize(void *buffer) const TRT_NOEXCEPT {
-  serialize_value(&buffer, mScaleFactor);
-  serialize_value(&buffer, mAlignCorners);
-}
-
-////////////////////// creator /////////////////////////////
-
-TRTBicubicInterpolateCreator::TRTBicubicInterpolateCreator() {
-  mPluginAttributes.clear();
-  mPluginAttributes.emplace_back(nvinfer1::PluginField("scale_factor"));
-  mPluginAttributes.emplace_back(nvinfer1::PluginField("align_corners"));
-  mFC.nbFields = mPluginAttributes.size();
-  mFC.fields = mPluginAttributes.data();
-}
-
-const char *TRTBicubicInterpolateCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
-
-const char *TRTBicubicInterpolateCreator::getPluginVersion() const TRT_NOEXCEPT {
-  return PLUGIN_VERSION;
-}
-
-nvinfer1::IPluginV2 *TRTBicubicInterpolateCreator::createPlugin(
-    const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
-  nvinfer1::Dims size{2, {1, 1}};
-  std::vector<float> scale_factor;
-  bool align_corners = 1;
-
-  for (int i = 0; i < fc->nbFields; i++) {
-    if (fc->fields[i].data == nullptr) {
-      continue;
-    }
-    std::string field_name(fc->fields[i].name);
-
-    if (field_name.compare("scale_factor") == 0) {
-      int data_size = (fc->fields[i].length);
-      if (data_size != 2) {
-        data_size = data_size / sizeof(float);
-      }
-      ASSERT(data_size == 2)
-      const float *data_start = static_cast<const float *>(fc->fields[i].data);
-      scale_factor = std::vector<float>(data_start, data_start + data_size);
-    }
-
-    if (field_name.compare("align_corners") == 0) {
-      align_corners = static_cast<const int *>(fc->fields[i].data)[0];
-    }
-  }
-
-  TRTBicubicInterpolate *plugin = new TRTBicubicInterpolate(name, scale_factor, align_corners);
-  plugin->setPluginNamespace(getPluginNamespace());
-  return plugin;
-}
-
-nvinfer1::IPluginV2 *TRTBicubicInterpolateCreator::deserializePlugin(
-    const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
-  auto plugin = new TRTBicubicInterpolate(name, serialData, serialLength);
-  plugin->setPluginNamespace(getPluginNamespace());
-  return plugin;
-}
-REGISTER_TENSORRT_PLUGIN(TRTBicubicInterpolateCreator);
+namespace mmdeploy
+{
+    namespace
+    {
+        static const char* PLUGIN_VERSION{"1"};
+        static const char* PLUGIN_NAME{"TRTBicubicInterpolate"};
+    }  // namespace
+
+    TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string& name,
+                                                 std::vector<float> scale_factor,
+                                                 bool align_corners)
+        : TRTPluginBase(name)
+        , mScaleFactor(scale_factor)
+        , mAlignCorners(align_corners)
+    {
+    }
+
+    TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string name, const void* data, size_t length)
+        : TRTPluginBase(name)
+    {
+        deserialize_value(&data, &length, &mScaleFactor);
+        deserialize_value(&data, &length, &mAlignCorners);
+    }
+
+    nvinfer1::IPluginV2DynamicExt* TRTBicubicInterpolate::clone() const TRT_NOEXCEPT
+    {
+        TRTBicubicInterpolate* plugin =
+            new TRTBicubicInterpolate(mLayerName, mScaleFactor, mAlignCorners);
+        plugin->setPluginNamespace(getPluginNamespace());
+
+        return plugin;
+    }
+
+    nvinfer1::DimsExprs TRTBicubicInterpolate::getOutputDimensions(
+        int outputIndex,
+        const nvinfer1::DimsExprs* inputs,
+        int nbInputs,
+        nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT
+    {
+        nvinfer1::DimsExprs ret;
+        ret.nbDims = 4;
+        ret.d[0] = inputs[0].d[0];
+        ret.d[1] = inputs[0].d[1];
+        auto height = exprBuilder.constant(mScaleFactor[0]);
+        auto width = exprBuilder.constant(mScaleFactor[1]);
+        auto d2 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[2], *height);
+        auto d3 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[3], *width);
+        ret.d[2] = d2;
+        ret.d[3] = d3;
+
+        return ret;
+    }
+
+    bool TRTBicubicInterpolate::supportsFormatCombination(int pos,
+                                                          const nvinfer1::PluginTensorDesc* ioDesc,
+                                                          int nbInputs,
+                                                          int nbOutputs) TRT_NOEXCEPT
+    {
+        if (pos == 0)
+        {
+            return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
+                    ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
+        }
+        else
+        {
+            return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
+        }
+    }
+
+    void TRTBicubicInterpolate::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
+                                                int nbInputs,
+                                                const nvinfer1::DynamicPluginTensorDesc* outputs,
+                                                int nbOutputs) TRT_NOEXCEPT {}
+
+    size_t TRTBicubicInterpolate::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+                                                   int nbInputs,
+                                                   const nvinfer1::PluginTensorDesc* outputs,
+                                                   int nbOutputs) const TRT_NOEXCEPT
+    {
+        return 0;
+    }
+
+    int TRTBicubicInterpolate::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                                       const nvinfer1::PluginTensorDesc* outputDesc,
+                                       const void* const* inputs,
+                                       void* const* outputs,
+                                       void* workSpace,
+                                       cudaStream_t stream) TRT_NOEXCEPT
+    {
+        int batch = inputDesc[0].dims.d[0];
+        int channels = inputDesc[0].dims.d[1];
+        int height = inputDesc[0].dims.d[2];
+        int width = inputDesc[0].dims.d[3];
+
+        int height_out = outputDesc[0].dims.d[2];
+        int width_out = outputDesc[0].dims.d[3];
+        const void* x = inputs[0];
+        void* output = outputs[0];
+
+        // TODO: add fp16 support
+        auto data_type = inputDesc[0].type;
+        switch (data_type)
+        {
+            case nvinfer1::DataType::kFLOAT:
+                bicubic_interpolate<float>((float*)x, (float*)output, batch, channels, height, width, height_out, width_out, mAlignCorners, stream);
+                break;
+            default:
+                return 1;
+                break;
+        }
+
+        return 0;
+    }
+
+    nvinfer1::DataType TRTBicubicInterpolate::getOutputDataType(int index,
+                                                                const nvinfer1::DataType* inputTypes,
+                                                                int nbInputs) const TRT_NOEXCEPT
+    {
+        return inputTypes[0];
+    }
+
+    // IPluginV2 Methods
+    const char* TRTBicubicInterpolate::getPluginType() const TRT_NOEXCEPT
+    {
+        return PLUGIN_NAME;
+    }
+
+    const char* TRTBicubicInterpolate::getPluginVersion() const TRT_NOEXCEPT
+    {
+        return PLUGIN_VERSION;
+    }
+
+    int TRTBicubicInterpolate::getNbOutputs() const TRT_NOEXCEPT
+    {
+        return 1;
+    }
+
+    size_t TRTBicubicInterpolate::getSerializationSize() const TRT_NOEXCEPT
+    {
+        return serialized_size(mScaleFactor) + serialized_size(mAlignCorners);
+    }
+
+    void TRTBicubicInterpolate::serialize(void* buffer) const TRT_NOEXCEPT
+    {
+        serialize_value(&buffer, mScaleFactor);
+        serialize_value(&buffer, mAlignCorners);
+    }
+
+    ////////////////////// creator /////////////////////////////
+
+    TRTBicubicInterpolateCreator::TRTBicubicInterpolateCreator()
+    {
+        mPluginAttributes.clear();
+        mPluginAttributes.emplace_back(nvinfer1::PluginField("scale_factor"));
+        mPluginAttributes.emplace_back(nvinfer1::PluginField("align_corners"));
+        mFC.nbFields = mPluginAttributes.size();
+        mFC.fields = mPluginAttributes.data();
+    }
+
+    const char* TRTBicubicInterpolateCreator::getPluginName() const TRT_NOEXCEPT
+    {
+        return PLUGIN_NAME;
+    }
+
+    const char* TRTBicubicInterpolateCreator::getPluginVersion() const TRT_NOEXCEPT
+    {
+        return PLUGIN_VERSION;
+    }
+
+    nvinfer1::IPluginV2* TRTBicubicInterpolateCreator::createPlugin(
+        const char* name,
+        const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT
+    {
+        nvinfer1::Dims size{2, {1, 1}};
+        std::vector<float> scale_factor;
+        bool align_corners = 1;
+
+        for (int i = 0; i < fc->nbFields; i++)
+        {
+            if (fc->fields[i].data == nullptr)
+            {
+                continue;
+            }
+            std::string field_name(fc->fields[i].name);
+
+            if (field_name.compare("scale_factor") == 0)
+            {
+                int data_size = (fc->fields[i].length);
+                if (data_size != 2)
+                {
+                    data_size = data_size / sizeof(float);
+                }
+                ASSERT(data_size == 2)
+                const float* data_start = static_cast<const float*>(fc->fields[i].data);
+                scale_factor = std::vector<float>(data_start, data_start + data_size);
+            }
+
+            if (field_name.compare("align_corners") == 0)
+            {
+                align_corners = static_cast<const int*>(fc->fields[i].data)[0];
+            }
+        }
+
+        TRTBicubicInterpolate* plugin = new TRTBicubicInterpolate(name, scale_factor, align_corners);
+        plugin->setPluginNamespace(getPluginNamespace());
+        return plugin;
+    }
+
+    nvinfer1::IPluginV2* TRTBicubicInterpolateCreator::deserializePlugin(
+        const char* name,
+        const void* serialData,
+        size_t serialLength) TRT_NOEXCEPT
+    {
+        auto plugin = new TRTBicubicInterpolate(name, serialData, serialLength);
+        plugin->setPluginNamespace(getPluginNamespace());
+        return plugin;
+    }
+    REGISTER_TENSORRT_PLUGIN(TRTBicubicInterpolateCreator);
 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.hpp b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.hpp
index 37ad7cf9ff..709976ce32 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.hpp
@@ -7,61 +7,58 @@
 #include
 #include "trt_plugin_base.hpp"
-namespace mmdeploy {
-class TRTBicubicInterpolate : public TRTPluginBase {
- public:
-  TRTBicubicInterpolate(const std::string &name, std::vector<float> scale_factor,
-                        bool align_corners);
+namespace mmdeploy
+{
+    class TRTBicubicInterpolate : public TRTPluginBase
+    {
+      public:
+        TRTBicubicInterpolate(const std::string& name, std::vector<float> scale_factor, bool align_corners);
-  TRTBicubicInterpolate(const std::string name, const void *data, size_t length);
+        TRTBicubicInterpolate(const std::string name, const void* data, size_t length);
-  TRTBicubicInterpolate() = delete;
+        TRTBicubicInterpolate() = delete;
-  // IPluginV2DynamicExt Methods
-  nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
-  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
-                                          int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
-      TRT_NOEXCEPT override;
-  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
-                                 int nbOutputs) TRT_NOEXCEPT override;
-  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
-                       const nvinfer1::DynamicPluginTensorDesc *out,
-                       int nbOutputs) TRT_NOEXCEPT override;
-  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
-                          const nvinfer1::PluginTensorDesc *outputs,
-                          int nbOutputs) const TRT_NOEXCEPT override;
-  int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
-              const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
-              void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
+        // IPluginV2DynamicExt Methods
+        nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
+        nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
+            TRT_NOEXCEPT override;
+        bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override;
+        void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override;
+        size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override;
+        int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                    const nvinfer1::PluginTensorDesc* outputDesc,
+                    const void* const* inputs,
+                    void* const* outputs,
+                    void* workspace,
+                    cudaStream_t stream) TRT_NOEXCEPT override;
-  // IPluginV2Ext Methods
-  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
-                                       int nbInputs) const TRT_NOEXCEPT override;
+        // IPluginV2Ext Methods
+        nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;
-  // IPluginV2 Methods
-  const char *getPluginType() const TRT_NOEXCEPT override;
-  const char *getPluginVersion() const TRT_NOEXCEPT override;
-  int getNbOutputs() const TRT_NOEXCEPT override;
-  size_t getSerializationSize() const TRT_NOEXCEPT override;
-  void serialize(void *buffer) const TRT_NOEXCEPT override;
+        // IPluginV2 Methods
+        const char* getPluginType() const TRT_NOEXCEPT override;
+        const char* getPluginVersion() const TRT_NOEXCEPT override;
+        int getNbOutputs() const TRT_NOEXCEPT override;
+        size_t getSerializationSize() const TRT_NOEXCEPT override;
+        void serialize(void* buffer) const TRT_NOEXCEPT override;
- private:
-  std::vector<float> mScaleFactor;
-  bool mAlignCorners;
-};
+      private:
+        std::vector<float> mScaleFactor;
+        bool mAlignCorners;
+    };
-class TRTBicubicInterpolateCreator : public TRTPluginCreatorBase {
- public:
-  TRTBicubicInterpolateCreator();
+    class TRTBicubicInterpolateCreator : public TRTPluginCreatorBase
+    {
+      public:
+        TRTBicubicInterpolateCreator();
-  const char *getPluginName() const TRT_NOEXCEPT override;
+        const char* getPluginName() const TRT_NOEXCEPT override;
-  const char *getPluginVersion() const TRT_NOEXCEPT override;
-  nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
-      TRT_NOEXCEPT override;
+        const char* getPluginVersion() const TRT_NOEXCEPT override;
+        nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc)
+            TRT_NOEXCEPT override;
-  nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
-                                         size_t serialLength) TRT_NOEXCEPT override;
-};
+        nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;
TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_BICUBIC_INTERPOLATE_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.cu index efb078c431..7a03aa3144 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.cu @@ -12,159 +12,176 @@ // Based on // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm -template -__device__ __forceinline__ static scalar_t cubic_convolution1(scalar_t x, scalar_t A) { - return ((A + 2) * x - (A + 3)) * x * x + 1; +template +__device__ __forceinline__ static scalar_t cubic_convolution1(scalar_t x, scalar_t A) +{ + return ((A + 2) * x - (A + 3)) * x * x + 1; } -template -__device__ __forceinline__ static scalar_t cubic_convolution2(scalar_t x, scalar_t A) { - return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +template +__device__ __forceinline__ static scalar_t cubic_convolution2(scalar_t x, scalar_t A) +{ + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; } -template +template __device__ __forceinline__ static void get_cubic_upsample_coefficients(scalar_t coeffs[4], - scalar_t t) { - scalar_t A = -0.75; - - scalar_t x1 = t; - coeffs[0] = cubic_convolution2(x1 + 1.0, A); - coeffs[1] = cubic_convolution1(x1, A); - - // opposite coefficients - scalar_t x2 = 1.0 - t; - coeffs[2] = cubic_convolution1(x2, A); - coeffs[3] = cubic_convolution2(x2 + 1.0, A); + scalar_t t) +{ + scalar_t A = -0.75; + + scalar_t x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0, A); + coeffs[1] = cubic_convolution1(x1, A); + + // opposite coefficients + scalar_t x2 = 1.0 - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0, A); } -template -__device__ __forceinline__ static scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2, - scalar_t x3, scalar_t t) { - scalar_t coeffs[4]; - get_cubic_upsample_coefficients(coeffs, t); +template +__device__ __forceinline__ static scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2, scalar_t x3, scalar_t t) +{ + scalar_t coeffs[4]; + get_cubic_upsample_coefficients(coeffs, t); - return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; } /* Used by UpSampleBicubic2d.cu */ -template -__device__ __forceinline__ static scalar_t upsample_get_value_bounded(const scalar_t *data, - int batch, int channel, - int batchsize, int channels, - int height, int width, int y, - int x) { - int access_y = max(min(y, height - 1), 0); - int access_x = max(min(x, width - 1), 0); - return data[batch * channels * height * width + channel * height * width + access_y * width + - access_x]; +template +__device__ __forceinline__ static scalar_t upsample_get_value_bounded(const scalar_t* data, + int batch, + int channel, + int batchsize, + int channels, + int height, + int width, + int y, + int x) +{ + int access_y = max(min(y, height - 1), 0); + int access_x = max(min(x, width - 1), 0); + return data[batch * channels * height * width + channel * height * width + access_y * width + + access_x]; } -template +template __device__ __forceinline__ scalar_t -area_pixel_compute_source_index(scalar_t scale, int64_t dst_index, bool align_corners, bool cubic) { - if (align_corners) { - return scale * dst_index; - } else { - 
scalar_t src_idx = scale * (dst_index + 0.5) - 0.5; - // [Note] Follow Opencv resize logic: - // We allow negative src_idx here and later will use - // dx = src_idx - floorf(src_idx) - // to compute the "distance"(which affects weights). - // For linear modes, weight distribution doesn't matter - // for negative indices as they use 2 pixels to interpolate. - // For example, [-1, 0], they both use pixel 0 value so it - // doesn't affect if we bound the src_idx to 0 or not. - // TODO: Our current linear mode impls use unbound indices - // where we should and then remove this cubic flag. - // This matters in cubic mode, as we might need [-1, 0, 1, 2] - // to interpolate and the weights can be affected. - return (!cubic && src_idx < 0) ? scalar_t(0) : src_idx; - } + area_pixel_compute_source_index(scalar_t scale, int64_t dst_index, bool align_corners, bool cubic) +{ + if (align_corners) + { + return scale * dst_index; + } + else + { + scalar_t src_idx = scale * (dst_index + 0.5) - 0.5; + // [Note] Follow Opencv resize logic: + // We allow negative src_idx here and later will use + // dx = src_idx - floorf(src_idx) + // to compute the "distance"(which affects weights). + // For linear modes, weight distribution doesn't matter + // for negative indices as they use 2 pixels to interpolate. + // For example, [-1, 0], they both use pixel 0 value so it + // doesn't affect if we bound the src_idx to 0 or not. + // TODO: Our current linear mode impls use unbound indices + // where we should and then remove this cubic flag. + // This matters in cubic mode, as we might need [-1, 0, 1, 2] + // to interpolate and the weights can be affected. + return (!cubic && src_idx < 0) ? scalar_t(0) : src_idx; + } } // cubic interpolation pytorch -template -__global__ void resize_cubic_kernel_torch(const int num_elements, const scalar_t *src, - const int batchsize, const int channels, int srcWidth, - int srcHeight, scalar_t *dst, int dstWidth, int dstHeight, - bool align_corners, float height_scale, - float width_scale) { - CUDA_1D_KERNEL_LOOP(index, num_elements) { - // Special case: input and output are the same size, just copy - const int output_x = index % dstWidth; - const int output_y = index / dstWidth; - - if (srcHeight == dstHeight && srcWidth == dstWidth) { - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; c++) { - const scalar_t val = src[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + - output_y * dstWidth + output_x]; - dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + output_y * dstWidth + - output_x] = val; +template +__global__ void resize_cubic_kernel_torch(const int num_elements, const scalar_t* src, const int batchsize, const int channels, int srcWidth, int srcHeight, scalar_t* dst, int dstWidth, int dstHeight, bool align_corners, float height_scale, float width_scale) +{ + CUDA_1D_KERNEL_LOOP(index, num_elements) + { + // Special case: input and output are the same size, just copy + const int output_x = index % dstWidth; + const int output_y = index / dstWidth; + + if (srcHeight == dstHeight && srcWidth == dstWidth) + { + for (int n = 0; n < batchsize; n++) + { + for (int c = 0; c < channels; c++) + { + const scalar_t val = src[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + + output_y * dstWidth + output_x]; + dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + output_y * dstWidth + + output_x] = val; + } + } + return; } - } - return; - } - // Interpolation kernel - scalar_t real_x = - 
area_pixel_compute_source_index(width_scale, output_x, align_corners, /*cubic=*/true); - int in_x = floorf(real_x); - scalar_t t_x = real_x - in_x; - - scalar_t real_y = - area_pixel_compute_source_index(height_scale, output_y, align_corners, /*cubic=*/true); - int in_y = floorf(real_y); - scalar_t t_y = real_y - in_y; - - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; c++) { - scalar_t coefficients[4]; - - for (int k = 0; k < 4; k++) { - coefficients[k] = cubic_interp1d( - upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, - in_y - 1 + k, in_x - 1), - upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, - in_y - 1 + k, in_x + 0), - upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, - in_y - 1 + k, in_x + 1), - upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, - in_y - 1 + k, in_x + 2), - t_x); + // Interpolation kernel + scalar_t real_x = + area_pixel_compute_source_index(width_scale, output_x, align_corners, /*cubic=*/true); + int in_x = floorf(real_x); + scalar_t t_x = real_x - in_x; + + scalar_t real_y = + area_pixel_compute_source_index(height_scale, output_y, align_corners, /*cubic=*/true); + int in_y = floorf(real_y); + scalar_t t_y = real_y - in_y; + + for (int n = 0; n < batchsize; n++) + { + for (int c = 0; c < channels; c++) + { + scalar_t coefficients[4]; + + for (int k = 0; k < 4; k++) + { + coefficients[k] = cubic_interp1d( + upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, in_y - 1 + k, in_x - 1), + upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, in_y - 1 + k, in_x + 0), + upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, in_y - 1 + k, in_x + 1), + upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, in_y - 1 + k, in_x + 2), + t_x); + } + + dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + output_y * dstWidth + + output_x] = scalar_t(cubic_interp1d(coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y)); + } } - - dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + output_y * dstWidth + - output_x] = scalar_t(cubic_interp1d(coefficients[0], coefficients[1], coefficients[2], - coefficients[3], t_y)); - } } - } } -template -void resizeGPU(const scalar_t *pIn_d, scalar_t *pOut_d, int batch, int channels, int srcWidth, - int srcHeight, int dstWidth, int dstHeight, bool align_corners, - cudaStream_t stream) { - float height_scale = float(srcHeight) / dstHeight; - float width_scale = float(srcWidth) / dstWidth; - if (align_corners && dstWidth > 1 && dstHeight > 1) { - height_scale = (float)(srcHeight - 1) / (dstHeight - 1); - width_scale = (float)(srcWidth - 1) / (dstWidth - 1); - } - int n = batch * dstWidth * dstHeight * channels; - resize_cubic_kernel_torch<<>>( - dstWidth * dstHeight, pIn_d, batch, channels, srcWidth, srcHeight, pOut_d, dstWidth, - dstHeight, align_corners, height_scale, width_scale); +template +void resizeGPU(const scalar_t* pIn_d, scalar_t* pOut_d, int batch, int channels, int srcWidth, int srcHeight, int dstWidth, int dstHeight, bool align_corners, cudaStream_t stream) +{ + float height_scale = float(srcHeight) / dstHeight; + float width_scale = float(srcWidth) / dstWidth; + if (align_corners && dstWidth > 1 && dstHeight > 1) + { + height_scale = (float)(srcHeight - 1) / (dstHeight - 1); + width_scale = (float)(srcWidth - 1) / (dstWidth - 1); + } + 
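+    // Illustration: for a 4 -> 8 upscale, the default scale is srcHeight / dstHeight = 0.5,
+    // so output row y samples around (y + 0.5) * 0.5 - 0.5 in the source (see
+    // area_pixel_compute_source_index above); with align_corners the scale becomes
+    // (srcHeight - 1) / (dstHeight - 1) = 3 / 7, pinning the first and last output rows
+    // exactly to the first and last source rows.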
int n = batch * dstWidth * dstHeight * channels;
+    resize_cubic_kernel_torch<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(
+        dstWidth * dstHeight,
+        pIn_d,
+        batch,
+        channels,
+        srcWidth,
+        srcHeight,
+        pOut_d,
+        dstWidth,
+        dstHeight,
+        align_corners,
+        height_scale,
+        width_scale);
 }
 
-template <typename scalar_t>
-void bicubic_interpolate(const scalar_t *input, scalar_t *output, int batch, int channels,
-                         int in_height, int in_width, int out_height, int out_width,
-                         bool align_corners, cudaStream_t stream) {
-  resizeGPU(input, output, batch, channels, in_width, in_height, out_width, out_height,
-            align_corners, stream);
+template<typename scalar_t>
+void bicubic_interpolate(const scalar_t* input, scalar_t* output, int batch, int channels, int in_height, int in_width, int out_height, int out_width, bool align_corners, cudaStream_t stream)
+{
+    resizeGPU(input, output, batch, channels, in_width, in_height, out_width, out_height, align_corners, stream);
 }
 
-template void bicubic_interpolate<float>(const float *input, float *output, int batch, int channels,
-                                         int in_height, int in_width, int out_height, int out_width,
-                                         bool align_corners, cudaStream_t stream);
+template void bicubic_interpolate<float>(const float* input, float* output, int batch, int channels, int in_height, int in_width, int out_height, int out_width, bool align_corners, cudaStream_t stream);
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.hpp
index 66560f59f5..28a89a71db 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.hpp
@@ -4,8 +4,6 @@
 
 #include "common_cuda_helper.hpp"
 
-template <typename scalar_t>
-void bicubic_interpolate(const scalar_t *input, scalar_t *output, int batch, int channels,
-                         int in_height, int in_width, int out_height, int out_width,
-                         bool align_corners, cudaStream_t stream);
+template<typename scalar_t>
+void bicubic_interpolate(const scalar_t* input, scalar_t* output, int batch, int channels, int in_height, int in_width, int out_height, int out_width, bool align_corners, cudaStream_t stream);
 #endif  // TRT_BICUBIC_INTERPOLATE_KERNEL_HPP
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/common_cuda_helper.hpp b/csrc/mmdeploy/backend_ops/tensorrt/common/common_cuda_helper.hpp
index c76cac8a32..97738f8f02 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/common/common_cuda_helper.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/common/common_cuda_helper.hpp
@@ -9,25 +9,27 @@
 #include
 
 #define CUDA_1D_KERNEL_LOOP(i, n) \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
 
 #define THREADS_PER_BLOCK 512
 
 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
 
-inline int GET_BLOCKS(const int N) {
-  int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK);
-  int max_block_num = 4096;
-  return std::min(optimal_block_num, max_block_num);
+inline int GET_BLOCKS(const int N)
+{
+    int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK);
+    int max_block_num = 4096;
+    return std::min(optimal_block_num, max_block_num);
 }
 
-#define cudaCheckError()                                                                 \
-  {                                                                                      \
-    cudaError_t e = cudaGetLastError();                                                  \
-    if (e != cudaSuccess) {                                                              \
-      printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e));   \
-      exit(0);                                                                           \
-    }                                                                                    \
-  }
+#define cudaCheckError()                                                                     \
+    {                                                                                        \
+        cudaError_t e = cudaGetLastError();                                                  \
+        if (e != cudaSuccess)                                                                \
+        {                                                                                    \
+            printf("Cuda 
failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(0); \ + } \ + } /** * Returns a view of the original tensor with its dimensions permuted. @@ -39,44 +41,43 @@ inline int GET_BLOCKS(const int N) { * @param[in] src_dim dim of src tensor * @param[in] stream cuda stream handle */ -template -void memcpyPermute(scalar_t* dst, const scalar_t* src, int* src_size, int* permute, int src_dim, - cudaStream_t stream = 0); +template +void memcpyPermute(scalar_t* dst, const scalar_t* src, int* src_size, int* permute, int src_dim, cudaStream_t stream = 0); -template -cublasStatus_t cublasGemmWrap(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, const scalar_t* alpha, - const scalar_t* A, int lda, const scalar_t* B, int ldb, - const scalar_t* beta, scalar_t* C, int ldc); +template +cublasStatus_t cublasGemmWrap(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const scalar_t* alpha, const scalar_t* A, int lda, const scalar_t* B, int ldb, const scalar_t* beta, scalar_t* C, int ldc); -template +template __device__ __forceinline__ scalar_t bilinear_interpolate(const scalar_t* __restrict__ input, - const int height, const int width, - scalar_t y, scalar_t x) { - // deal with cases that inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) return 0; + const int height, + const int width, + scalar_t y, + scalar_t x) +{ + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) return 0; - y = min(scalar_t(height - 1), max(scalar_t(0), y)); - x = min(scalar_t(width - 1), max(scalar_t(0), x)); + y = min(scalar_t(height - 1), max(scalar_t(0), y)); + x = min(scalar_t(width - 1), max(scalar_t(0), x)); - const int y_low = floor(y); - const int x_low = floor(x); - const int y_high = ceil(y); - const int x_high = ceil(x); + const int y_low = floor(y); + const int x_low = floor(x); + const int y_high = ceil(y); + const int x_high = ceil(x); - const scalar_t v1 = input[y_low * width + x_low]; - const scalar_t v2 = input[y_low * width + x_high]; - const scalar_t v3 = input[y_high * width + x_low]; - const scalar_t v4 = input[y_high * width + x_high]; + const scalar_t v1 = input[y_low * width + x_low]; + const scalar_t v2 = input[y_low * width + x_high]; + const scalar_t v3 = input[y_high * width + x_low]; + const scalar_t v4 = input[y_high * width + x_high]; - // lerp can be performed by fma - const scalar_t ly = y - y_low; - const scalar_t lx = x - x_low; - const scalar_t v_low = fma(v2 - v1, lx, v1); - const scalar_t v_high = fma(v4 - v3, lx, v3); - const scalar_t val = fma(v_high - v_low, ly, v_low); + // lerp can be performed by fma + const scalar_t ly = y - y_low; + const scalar_t lx = x - x_low; + const scalar_t v_low = fma(v2 - v1, lx, v1); + const scalar_t v_high = fma(v4 - v3, lx, v3); + const scalar_t val = fma(v_high - v_low, ly, v_low); - return val; + return val; } #endif // COMMON_CUDA_HELPER diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/nms/batched_nms_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/common/nms/batched_nms_kernel.hpp index 22cffa0605..8b28458fd0 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/nms/batched_nms_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/nms/batched_nms_kernel.hpp @@ -6,14 +6,6 @@ #include "cuda_runtime_api.h" #include "kernel.h" -pluginStatus_t nmsInference(cudaStream_t stream, const int N, const int 
perBatchBoxesSize, - const int perBatchScoresSize, const bool shareLocation, - const int backgroundLabelId, const int numPredsPerClass, - const int numClasses, const int topK, const int keepTopK, - const float scoreThreshold, const float iouThreshold, - const DataType DT_BBOX, const void* locData, const DataType DT_SCORE, - const void* confData, void* nmsedDets, void* nmsedLabels, - void* nmsedIndex, void* workspace, bool isNormalized, bool confSigmoid, - bool clipBoxes, bool rotated = false); +pluginStatus_t nmsInference(cudaStream_t stream, const int N, const int perBatchBoxesSize, const int perBatchScoresSize, const bool shareLocation, const int backgroundLabelId, const int numPredsPerClass, const int numClasses, const int topK, const int keepTopK, const float scoreThreshold, const float iouThreshold, const DataType DT_BBOX, const void* locData, const DataType DT_SCORE, const void* confData, void* nmsedDets, void* nmsedLabels, void* nmsedIndex, void* workspace, bool isNormalized, bool confSigmoid, bool clipBoxes, bool rotated = false); #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/nms/cub_helper.h b/csrc/mmdeploy/backend_ops/tensorrt/common/nms/cub_helper.h index 93fd2a4fb9..81500147e7 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/nms/cub_helper.h +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/nms/cub_helper.h @@ -2,14 +2,14 @@ // modify from // https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin #include "kernel.h" -template -size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) { - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedRadixSort::SortPairsDescending((void*)NULL, temp_storage_bytes, - (const KeyT*)NULL, (KeyT*)NULL, - (const ValueT*)NULL, (ValueT*)NULL, - num_items, // # items - num_segments, // # segments - (const int*)NULL, (const int*)NULL); - return temp_storage_bytes; +template +size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) +{ + size_t temp_storage_bytes = 0; + cub::DeviceSegmentedRadixSort::SortPairsDescending((void*)NULL, temp_storage_bytes, (const KeyT*)NULL, (KeyT*)NULL, (const ValueT*)NULL, (ValueT*)NULL, + num_items, // # items + num_segments, // # segments + (const int*)NULL, + (const int*)NULL); + return temp_storage_bytes; } diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/nms/kernel.h b/csrc/mmdeploy/backend_ops/tensorrt/common/nms/kernel.h index 1b50fa4e9f..87b089b623 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/nms/kernel.h +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/nms/kernel.h @@ -15,72 +15,54 @@ using namespace nvinfer1; #define DEBUG_ENABLE 0 -template -struct Bbox { - T xmin, ymin, xmax, ymax; - Bbox(T xmin, T ymin, T xmax, T ymax) : xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax) {} - Bbox() = default; +template +struct Bbox +{ + T xmin, ymin, xmax, ymax; + Bbox(T xmin, T ymin, T xmax, T ymax) + : xmin(xmin) + , ymin(ymin) + , xmax(xmax) + , ymax(ymax) + { + } + Bbox() = default; }; -size_t get_cuda_arch(int devID); +size_t get_cuda_arch(int devID); -int8_t* alignPtr(int8_t* ptr, uintptr_t to); +int8_t* alignPtr(int8_t* ptr, uintptr_t to); -int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize); +int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize); -void setUniformOffsets(cudaStream_t stream, int num_segments, int offset, int* d_offsets); +void setUniformOffsets(cudaStream_t stream, int num_segments, int offset, int* d_offsets); -pluginStatus_t allClassNMS(cudaStream_t stream, int num, int num_classes, int 
num_preds_per_class, - int top_k, float nms_threshold, bool share_location, bool isNormalized, - DataType DT_SCORE, DataType DT_BBOX, void* bbox_data, - void* beforeNMS_scores, void* beforeNMS_index_array, - void* afterNMS_scores, void* afterNMS_index_array, bool flipXY = false); +pluginStatus_t allClassNMS(cudaStream_t stream, int num, int num_classes, int num_preds_per_class, int top_k, float nms_threshold, bool share_location, bool isNormalized, DataType DT_SCORE, DataType DT_BBOX, void* bbox_data, void* beforeNMS_scores, void* beforeNMS_index_array, void* afterNMS_scores, void* afterNMS_index_array, bool flipXY = false); -pluginStatus_t allClassRotatedNMS(cudaStream_t stream, int num, int num_classes, - int num_preds_per_class, int top_k, float nms_threshold, - bool share_location, bool isNormalized, DataType DT_SCORE, - DataType DT_BBOX, void* bbox_data, void* beforeNMS_scores, - void* beforeNMS_index_array, void* afterNMS_scores, - void* afterNMS_index_array, bool flipXY = false); +pluginStatus_t allClassRotatedNMS(cudaStream_t stream, int num, int num_classes, int num_preds_per_class, int top_k, float nms_threshold, bool share_location, bool isNormalized, DataType DT_SCORE, DataType DT_BBOX, void* bbox_data, void* beforeNMS_scores, void* beforeNMS_index_array, void* afterNMS_scores, void* afterNMS_index_array, bool flipXY = false); -size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX); +size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX); -size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX); +size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX); -size_t sortScoresPerClassWorkspaceSize(int num, int num_classes, int num_preds_per_class, - DataType DT_CONF); +size_t sortScoresPerClassWorkspaceSize(int num, int num_classes, int num_preds_per_class, DataType DT_CONF); -size_t sortScoresPerImageWorkspaceSize(int num_images, int num_items_per_image, DataType DT_SCORE); +size_t sortScoresPerImageWorkspaceSize(int num_images, int num_items_per_image, DataType DT_SCORE); -pluginStatus_t sortScoresPerImage(cudaStream_t stream, int num_images, int num_items_per_image, - DataType DT_SCORE, void* unsorted_scores, - void* unsorted_bbox_indices, void* sorted_scores, - void* sorted_bbox_indices, void* workspace); +pluginStatus_t sortScoresPerImage(cudaStream_t stream, int num_images, int num_items_per_image, DataType DT_SCORE, void* unsorted_scores, void* unsorted_bbox_indices, void* sorted_scores, void* sorted_bbox_indices, void* workspace); -pluginStatus_t sortScoresPerClass(cudaStream_t stream, int num, int num_classes, - int num_preds_per_class, int background_label_id, - float confidence_threshold, DataType DT_SCORE, - void* conf_scores_gpu, void* index_array_gpu, void* workspace); +pluginStatus_t sortScoresPerClass(cudaStream_t stream, int num, int num_classes, int num_preds_per_class, int background_label_id, float confidence_threshold, DataType DT_SCORE, void* conf_scores_gpu, void* index_array_gpu, void* workspace); -size_t calculateTotalWorkspaceSize(size_t* workspaces, int count); +size_t calculateTotalWorkspaceSize(size_t* workspaces, int count); -pluginStatus_t permuteData(cudaStream_t stream, int nthreads, int num_classes, int num_data, - int num_dim, DataType DT_DATA, bool confSigmoid, const void* data, - void* new_data); +pluginStatus_t permuteData(cudaStream_t stream, int nthreads, int num_classes, int num_data, int num_dim, DataType DT_DATA, bool confSigmoid, const void* 
data, void* new_data); -size_t detectionForwardPreNMSSize(int N, int C2); +size_t detectionForwardPreNMSSize(int N, int C2); -size_t detectionForwardPostNMSSize(int N, int numClasses, int topK); +size_t detectionForwardPostNMSSize(int N, int numClasses, int topK); -pluginStatus_t gatherNMSOutputs(cudaStream_t stream, bool shareLocation, int numImages, - int numPredsPerClass, int numClasses, int topK, int keepTopK, - DataType DT_BBOX, DataType DT_SCORE, const void* indices, - const void* scores, const void* bboxData, void* nmsedDets, - void* nmsedLabels, void* nmsedIndex = nullptr, - bool clipBoxes = true, bool rotated = false); +pluginStatus_t gatherNMSOutputs(cudaStream_t stream, bool shareLocation, int numImages, int numPredsPerClass, int numClasses, int topK, int keepTopK, DataType DT_BBOX, DataType DT_SCORE, const void* indices, const void* scores, const void* bboxData, void* nmsedDets, void* nmsedLabels, void* nmsedIndex = nullptr, bool clipBoxes = true, bool rotated = false); -size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, - int numPredsPerClass, int topK, DataType DT_BBOX, - DataType DT_SCORE); +size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, int numPredsPerClass, int topK, DataType DT_BBOX, DataType DT_SCORE); #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_base.hpp b/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_base.hpp index 8440bb6219..482d11a924 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_base.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_base.hpp @@ -5,73 +5,98 @@ #include "NvInferVersion.h" #include "trt_plugin_helper.hpp" -namespace mmdeploy { +namespace mmdeploy +{ #if NV_TENSORRT_MAJOR > 7 -#define TRT_NOEXCEPT noexcept + #define TRT_NOEXCEPT noexcept #else -#define TRT_NOEXCEPT + #define TRT_NOEXCEPT #endif -class TRTPluginBase : public nvinfer1::IPluginV2DynamicExt { - public: - TRTPluginBase(const std::string &name) : mLayerName(name) {} - // IPluginV2 Methods - const char *getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - int initialize() TRT_NOEXCEPT override { return STATUS_SUCCESS; } - void terminate() TRT_NOEXCEPT override {} - void destroy() TRT_NOEXCEPT override { delete this; } - void setPluginNamespace(const char *pluginNamespace) TRT_NOEXCEPT override { - mNamespace = pluginNamespace; - } - const char *getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } + class TRTPluginBase : public nvinfer1::IPluginV2DynamicExt + { + public: + TRTPluginBase(const std::string& name) + : mLayerName(name) + { + } + // IPluginV2 Methods + const char* getPluginVersion() const TRT_NOEXCEPT override + { + return "1"; + } + int initialize() TRT_NOEXCEPT override + { + return STATUS_SUCCESS; + } + void terminate() TRT_NOEXCEPT override {} + void destroy() TRT_NOEXCEPT override + { + delete this; + } + void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override + { + mNamespace = pluginNamespace; + } + const char* getPluginNamespace() const TRT_NOEXCEPT override + { + return mNamespace.c_str(); + } - virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override {} + virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override {} - virtual 
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override { - return 0; - } + virtual size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override + { + return 0; + } - virtual void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override {} + virtual void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override {} - virtual void detachFromContext() TRT_NOEXCEPT override {} + virtual void detachFromContext() TRT_NOEXCEPT override {} - protected: - const std::string mLayerName; - std::string mNamespace; + protected: + const std::string mLayerName; + std::string mNamespace; #if NV_TENSORRT_MAJOR < 8 - protected: - // To prevent compiler warnings. - using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::enqueue; - using nvinfer1::IPluginV2DynamicExt::getOutputDimensions; - using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::supportsFormat; + protected: + // To prevent compiler warnings. + using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch; + using nvinfer1::IPluginV2DynamicExt::enqueue; + using nvinfer1::IPluginV2DynamicExt::getOutputDimensions; + using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch; + using nvinfer1::IPluginV2DynamicExt::supportsFormat; #endif -}; + }; -class TRTPluginCreatorBase : public nvinfer1::IPluginCreator { - public: - const char *getPluginVersion() const TRT_NOEXCEPT override { return "1"; }; + class TRTPluginCreatorBase : public nvinfer1::IPluginCreator + { + public: + const char* getPluginVersion() const TRT_NOEXCEPT override + { + return "1"; + }; - const nvinfer1::PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return &mFC; } + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override + { + return &mFC; + } - void setPluginNamespace(const char *pluginNamespace) TRT_NOEXCEPT override { - mNamespace = pluginNamespace; - } + void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override + { + mNamespace = pluginNamespace; + } - const char *getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } + const char* getPluginNamespace() const TRT_NOEXCEPT override + { + return mNamespace.c_str(); + } - protected: - nvinfer1::PluginFieldCollection mFC; - std::vector mPluginAttributes; - std::string mNamespace; -}; + protected: + nvinfer1::PluginFieldCollection mFC; + std::vector mPluginAttributes; + std::string mNamespace; + }; } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_helper.hpp b/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_helper.hpp index 41b47acdbe..050c0dd308 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_helper.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_helper.hpp @@ -11,145 +11,159 @@ cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t* cudnn_dtype); // Enumerator for status -typedef enum { - STATUS_SUCCESS = 0, - STATUS_FAILURE = 1, - STATUS_BAD_PARAM = 2, - STATUS_NOT_SUPPORTED = 3, - STATUS_NOT_INITIALIZED = 4 +typedef enum +{ + STATUS_SUCCESS = 0, + STATUS_FAILURE 
= 1, + STATUS_BAD_PARAM = 2, + STATUS_NOT_SUPPORTED = 3, + STATUS_NOT_INITIALIZED = 4 } pluginStatus_t; -#define ASSERT(assertion) \ - { \ - if (!(assertion)) { \ - std::cerr << "#assertion" << __FILE__ << "," << __LINE__ << std::endl; \ - abort(); \ - } \ - } - -#define CUASSERT(status_) \ - { \ - auto s_ = status_; \ - if (s_ != cudaSuccess) { \ - std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << ", " << cudaGetErrorString(s_) \ - << std::endl; \ - } \ - } -#define CUBLASASSERT(status_) \ - { \ - auto s_ = status_; \ - if (s_ != CUBLAS_STATUS_SUCCESS) { \ - std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \ - } \ - } -#define CUERRORMSG(status_) \ - { \ - auto s_ = status_; \ - if (s_ != 0) std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \ - } +#define ASSERT(assertion) \ + { \ + if (!(assertion)) \ + { \ + std::cerr << "#assertion" << __FILE__ << "," << __LINE__ << std::endl; \ + abort(); \ + } \ + } + +#define CUASSERT(status_) \ + { \ + auto s_ = status_; \ + if (s_ != cudaSuccess) \ + { \ + std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << ", " << cudaGetErrorString(s_) \ + << std::endl; \ + } \ + } +#define CUBLASASSERT(status_) \ + { \ + auto s_ = status_; \ + if (s_ != CUBLAS_STATUS_SUCCESS) \ + { \ + std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \ + } \ + } +#define CUERRORMSG(status_) \ + { \ + auto s_ = status_; \ + if (s_ != 0) std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \ + } #ifndef DEBUG -#define CHECK(status) \ - do { \ - if (status != 0) abort(); \ - } while (0) - -#define ASSERT_PARAM(exp) \ - do { \ - if (!(exp)) return STATUS_BAD_PARAM; \ - } while (0) - -#define ASSERT_FAILURE(exp) \ - do { \ - if (!(exp)) return STATUS_FAILURE; \ - } while (0) - -#define CSC(call, err) \ - do { \ - cudaError_t cudaStatus = call; \ - if (cudaStatus != cudaSuccess) { \ - return err; \ - } \ - } while (0) - -#define DEBUG_PRINTF(...) \ - do { \ - } while (0) + #define CHECK(status) \ + do { \ + if (status != 0) abort(); \ + } while (0) + + #define ASSERT_PARAM(exp) \ + do { \ + if (!(exp)) return STATUS_BAD_PARAM; \ + } while (0) + + #define ASSERT_FAILURE(exp) \ + do { \ + if (!(exp)) return STATUS_FAILURE; \ + } while (0) + + #define CSC(call, err) \ + do { \ + cudaError_t cudaStatus = call; \ + if (cudaStatus != cudaSuccess) \ + { \ + return err; \ + } \ + } while (0) + + #define DEBUG_PRINTF(...) \ + do { \ + } while (0) #else -#define ASSERT_PARAM(exp) \ - do { \ - if (!(exp)) { \ - fprintf(stderr, "Bad param - " #exp ", %s:%d\n", __FILE__, __LINE__); \ - return STATUS_BAD_PARAM; \ - } \ - } while (0) - -#define ASSERT_FAILURE(exp) \ - do { \ - if (!(exp)) { \ - fprintf(stderr, "Failure - " #exp ", %s:%d\n", __FILE__, __LINE__); \ - return STATUS_FAILURE; \ - } \ - } while (0) - -#define CSC(call, err) \ - do { \ - cudaError_t cudaStatus = call; \ - if (cudaStatus != cudaSuccess) { \ - printf("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(cudaStatus)); \ - return err; \ - } \ - } while (0) - -#define CHECK(status) \ - { \ - if (status != 0) { \ - DEBUG_PRINTF("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \ - abort(); \ - } \ - } - -#define DEBUG_PRINTF(...) 
\ - do { \ - printf(__VA_ARGS__); \ - } while (0) + #define ASSERT_PARAM(exp) \ + do { \ + if (!(exp)) \ + { \ + fprintf(stderr, "Bad param - " #exp ", %s:%d\n", __FILE__, __LINE__); \ + return STATUS_BAD_PARAM; \ + } \ + } while (0) + + #define ASSERT_FAILURE(exp) \ + do { \ + if (!(exp)) \ + { \ + fprintf(stderr, "Failure - " #exp ", %s:%d\n", __FILE__, __LINE__); \ + return STATUS_FAILURE; \ + } \ + } while (0) + + #define CSC(call, err) \ + do { \ + cudaError_t cudaStatus = call; \ + if (cudaStatus != cudaSuccess) \ + { \ + printf("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(cudaStatus)); \ + return err; \ + } \ + } while (0) + + #define CHECK(status) \ + { \ + if (status != 0) \ + { \ + DEBUG_PRINTF("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \ + abort(); \ + } \ + } + + #define DEBUG_PRINTF(...) \ + do { \ + printf(__VA_ARGS__); \ + } while (0) #endif -namespace mmdeploy { - -const int MAXTENSORDIMS = 10; - -struct TensorDesc { - int shape[MAXTENSORDIMS]; - int stride[MAXTENSORDIMS]; - int dim; -}; - -inline unsigned int getElementSize(nvinfer1::DataType t) { - switch (t) { - case nvinfer1::DataType::kINT32: - return 4; - case nvinfer1::DataType::kFLOAT: - return 4; - case nvinfer1::DataType::kHALF: - return 2; - // case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kINT8: - return 1; - default: - throw std::runtime_error("Invalid DataType."); - } - throw std::runtime_error("Invalid DataType."); - return 0; -} - -inline size_t getAlignedSize(size_t origin_size, size_t aligned_number = 16) { - return size_t((origin_size + aligned_number - 1) / aligned_number) * aligned_number; -} +namespace mmdeploy +{ + + const int MAXTENSORDIMS = 10; + + struct TensorDesc + { + int shape[MAXTENSORDIMS]; + int stride[MAXTENSORDIMS]; + int dim; + }; + + inline unsigned int getElementSize(nvinfer1::DataType t) + { + switch (t) + { + case nvinfer1::DataType::kINT32: + return 4; + case nvinfer1::DataType::kFLOAT: + return 4; + case nvinfer1::DataType::kHALF: + return 2; + // case nvinfer1::DataType::kBOOL: + case nvinfer1::DataType::kINT8: + return 1; + default: + throw std::runtime_error("Invalid DataType."); + } + throw std::runtime_error("Invalid DataType."); + return 0; + } + + inline size_t getAlignedSize(size_t origin_size, size_t aligned_number = 16) + { + return size_t((origin_size + aligned_number - 1) / aligned_number) * aligned_number; + } } // namespace mmdeploy #endif // TRT_PLUGIN_HELPER_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/trt_serialize.hpp b/csrc/mmdeploy/backend_ops/tensorrt/common/trt_serialize.hpp index db88184432..d1d2fff678 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/trt_serialize.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/trt_serialize.hpp @@ -9,89 +9,111 @@ #include #include -template +template inline void serialize_value(void** buffer, T const& value); -template +template inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value); -namespace { +namespace +{ -template -struct Serializer {}; + template + struct Serializer + { + }; -template -struct Serializer::value || std::is_enum::value || - std::is_pod::value>::type> { - static size_t serialized_size(T const& value) { return sizeof(T); } - static void serialize(void** buffer, T const& value) { - ::memcpy(*buffer, &value, sizeof(T)); - reinterpret_cast(*buffer) += sizeof(T); - } - static void deserialize(void const** buffer, size_t* buffer_size, T* value) { - assert(*buffer_size >= sizeof(T)); - ::memcpy(value, 
*buffer, sizeof(T)); - reinterpret_cast(*buffer) += sizeof(T); - *buffer_size -= sizeof(T); - } -}; + template + struct Serializer::value || std::is_enum::value || + std::is_pod::value>::type> + { + static size_t serialized_size(T const& value) + { + return sizeof(T); + } + static void serialize(void** buffer, T const& value) + { + ::memcpy(*buffer, &value, sizeof(T)); + reinterpret_cast(*buffer) += sizeof(T); + } + static void deserialize(void const** buffer, size_t* buffer_size, T* value) + { + assert(*buffer_size >= sizeof(T)); + ::memcpy(value, *buffer, sizeof(T)); + reinterpret_cast(*buffer) += sizeof(T); + *buffer_size -= sizeof(T); + } + }; -template <> -struct Serializer { - static size_t serialized_size(const char* value) { return strlen(value) + 1; } - static void serialize(void** buffer, const char* value) { - ::strcpy(static_cast(*buffer), value); - reinterpret_cast(*buffer) += strlen(value) + 1; - } - static void deserialize(void const** buffer, size_t* buffer_size, const char** value) { - *value = static_cast(*buffer); - size_t data_size = strnlen(*value, *buffer_size) + 1; - assert(*buffer_size >= data_size); - reinterpret_cast(*buffer) += data_size; - *buffer_size -= data_size; - } -}; + template<> + struct Serializer + { + static size_t serialized_size(const char* value) + { + return strlen(value) + 1; + } + static void serialize(void** buffer, const char* value) + { + ::strcpy(static_cast(*buffer), value); + reinterpret_cast(*buffer) += strlen(value) + 1; + } + static void deserialize(void const** buffer, size_t* buffer_size, const char** value) + { + *value = static_cast(*buffer); + size_t data_size = strnlen(*value, *buffer_size) + 1; + assert(*buffer_size >= data_size); + reinterpret_cast(*buffer) += data_size; + *buffer_size -= data_size; + } + }; -template -struct Serializer, - typename std::enable_if::value || std::is_enum::value || - std::is_pod::value>::type> { - static size_t serialized_size(std::vector const& value) { - return sizeof(value.size()) + value.size() * sizeof(T); - } - static void serialize(void** buffer, std::vector const& value) { - serialize_value(buffer, value.size()); - size_t nbyte = value.size() * sizeof(T); - ::memcpy(*buffer, value.data(), nbyte); - reinterpret_cast(*buffer) += nbyte; - } - static void deserialize(void const** buffer, size_t* buffer_size, std::vector* value) { - size_t size; - deserialize_value(buffer, buffer_size, &size); - value->resize(size); - size_t nbyte = value->size() * sizeof(T); - assert(*buffer_size >= nbyte); - ::memcpy(value->data(), *buffer, nbyte); - reinterpret_cast(*buffer) += nbyte; - *buffer_size -= nbyte; - } -}; + template + struct Serializer, + typename std::enable_if::value || std::is_enum::value || + std::is_pod::value>::type> + { + static size_t serialized_size(std::vector const& value) + { + return sizeof(value.size()) + value.size() * sizeof(T); + } + static void serialize(void** buffer, std::vector const& value) + { + serialize_value(buffer, value.size()); + size_t nbyte = value.size() * sizeof(T); + ::memcpy(*buffer, value.data(), nbyte); + reinterpret_cast(*buffer) += nbyte; + } + static void deserialize(void const** buffer, size_t* buffer_size, std::vector* value) + { + size_t size; + deserialize_value(buffer, buffer_size, &size); + value->resize(size); + size_t nbyte = value->size() * sizeof(T); + assert(*buffer_size >= nbyte); + ::memcpy(value->data(), *buffer, nbyte); + reinterpret_cast(*buffer) += nbyte; + *buffer_size -= nbyte; + } + }; } // namespace -template -inline size_t 
serialized_size(T const& value) { - return Serializer::serialized_size(value); +template +inline size_t serialized_size(T const& value) +{ + return Serializer::serialized_size(value); } -template -inline void serialize_value(void** buffer, T const& value) { - return Serializer::serialize(buffer, value); +template +inline void serialize_value(void** buffer, T const& value) +{ + return Serializer::serialize(buffer, value); } -template -inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value) { - return Serializer::deserialize(buffer, buffer_size, value); +template +inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value) +{ + return Serializer::deserialize(buffer, buffer_size, value); } #endif // TRT_SERIALIZE_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassNMS.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassNMS.cu index 44c08152db..08a6a617ce 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassNMS.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassNMS.cu @@ -7,62 +7,78 @@ const static int BS = 512; -template -__device__ T_BBOX bboxSize(const Bbox &bbox, const bool normalized, T_BBOX offset) { - if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin) { - // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0. - return 0; - } else { - T_BBOX width = bbox.xmax - bbox.xmin; - T_BBOX height = bbox.ymax - bbox.ymin; - if (normalized) { - return width * height; - } else { - // If bbox is not within range [0, 1]. - return (width + offset) * (height + offset); +template +__device__ T_BBOX bboxSize(const Bbox& bbox, const bool normalized, T_BBOX offset) +{ + if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin) + { + // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0. + return 0; + } + else + { + T_BBOX width = bbox.xmax - bbox.xmin; + T_BBOX height = bbox.ymax - bbox.ymin; + if (normalized) + { + return width * height; + } + else + { + // If bbox is not within range [0, 1]. + return (width + offset) * (height + offset); + } } - } } -template -__device__ void intersectBbox(const Bbox &bbox1, const Bbox &bbox2, - Bbox *intersect_bbox) { - if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin || bbox2.ymin > bbox1.ymax || - bbox2.ymax < bbox1.ymin) { - // Return [0, 0, 0, 0] if there is no intersection. - intersect_bbox->xmin = T_BBOX(0); - intersect_bbox->ymin = T_BBOX(0); - intersect_bbox->xmax = T_BBOX(0); - intersect_bbox->ymax = T_BBOX(0); - } else { - intersect_bbox->xmin = max(bbox1.xmin, bbox2.xmin); - intersect_bbox->ymin = max(bbox1.ymin, bbox2.ymin); - intersect_bbox->xmax = min(bbox1.xmax, bbox2.xmax); - intersect_bbox->ymax = min(bbox1.ymax, bbox2.ymax); - } +template +__device__ void intersectBbox(const Bbox& bbox1, const Bbox& bbox2, Bbox* intersect_bbox) +{ + if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin || bbox2.ymin > bbox1.ymax || + bbox2.ymax < bbox1.ymin) + { + // Return [0, 0, 0, 0] if there is no intersection. 
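+        // (A zeroed box then scores zero overlap in jaccardOverlap below, which
+        // computes IoU as intersection / (area1 + area2 - intersection).)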
+ intersect_bbox->xmin = T_BBOX(0); + intersect_bbox->ymin = T_BBOX(0); + intersect_bbox->xmax = T_BBOX(0); + intersect_bbox->ymax = T_BBOX(0); + } + else + { + intersect_bbox->xmin = max(bbox1.xmin, bbox2.xmin); + intersect_bbox->ymin = max(bbox1.ymin, bbox2.ymin); + intersect_bbox->xmax = min(bbox1.xmax, bbox2.xmax); + intersect_bbox->ymax = min(bbox1.ymax, bbox2.ymax); + } } -template -__device__ float jaccardOverlap(const Bbox &bbox1, const Bbox &bbox2, - const bool normalized, T_BBOX offset) { - Bbox intersect_bbox; - intersectBbox(bbox1, bbox2, &intersect_bbox); - float intersect_width, intersect_height; - if (normalized) { - intersect_width = intersect_bbox.xmax - intersect_bbox.xmin; - intersect_height = intersect_bbox.ymax - intersect_bbox.ymin; - } else { - intersect_width = intersect_bbox.xmax - intersect_bbox.xmin + offset; - intersect_height = intersect_bbox.ymax - intersect_bbox.ymin + offset; - } - if (intersect_width > 0 && intersect_height > 0) { - float intersect_size = intersect_width * intersect_height; - float bbox1_size = bboxSize(bbox1, normalized, offset); - float bbox2_size = bboxSize(bbox2, normalized, offset); - return intersect_size / (bbox1_size + bbox2_size - intersect_size); - } else { - return 0.; - } +template +__device__ float jaccardOverlap(const Bbox& bbox1, const Bbox& bbox2, const bool normalized, T_BBOX offset) +{ + Bbox intersect_bbox; + intersectBbox(bbox1, bbox2, &intersect_bbox); + float intersect_width, intersect_height; + if (normalized) + { + intersect_width = intersect_bbox.xmax - intersect_bbox.xmin; + intersect_height = intersect_bbox.ymax - intersect_bbox.ymin; + } + else + { + intersect_width = intersect_bbox.xmax - intersect_bbox.xmin + offset; + intersect_height = intersect_bbox.ymax - intersect_bbox.ymin + offset; + } + if (intersect_width > 0 && intersect_height > 0) + { + float intersect_size = intersect_width * intersect_height; + float bbox1_size = bboxSize(bbox1, normalized, offset); + float bbox2_size = bboxSize(bbox2, normalized, offset); + return intersect_size / (bbox1_size + bbox2_size - intersect_size); + } + else + { + return 0.; + } } /********** new NMS for only score and index array **********/ @@ -82,186 +98,211 @@ allClassNMS_kernel(const int num, const int num_classes, const int num_preds_per // location information T_SCORE *beforeNMS_scores, int *beforeNMS_index_array, T_SCORE *afterNMS_scores, int *afterNMS_index_array, bool flipXY = false) { - // clang-format on - //__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE]; - __shared__ bool kept_bboxinfo_flag[TSIZE * BS]; - for (int i = 0; i < num; i++) { - const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class; - const int max_idx = offset + top_k; // put top_k bboxes into NMS calculation - const int bbox_idx_offset = - share_location ? (i * num_preds_per_class) : (i * num_classes * num_preds_per_class); - - // local thread data - int loc_bboxIndex[TSIZE]; - Bbox loc_bbox[TSIZE]; - - // initialize Bbox, Bboxinfo, kept_bboxinfo_flag - // Eliminate shared memory RAW hazard - __syncthreads(); + // clang-format on + //__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE]; + __shared__ bool kept_bboxinfo_flag[TSIZE * BS]; + for (int i = 0; i < num; i++) + { + const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class; + const int max_idx = offset + top_k; // put top_k bboxes into NMS calculation + const int bbox_idx_offset = + share_location ? 
(i * num_preds_per_class) : (i * num_classes * num_preds_per_class); + + // local thread data + int loc_bboxIndex[TSIZE]; + Bbox loc_bbox[TSIZE]; + + // initialize Bbox, Bboxinfo, kept_bboxinfo_flag + // Eliminate shared memory RAW hazard + __syncthreads(); #pragma unroll - for (int t = 0; t < TSIZE; t++) { - const int cur_idx = threadIdx.x + blockDim.x * t; - const int item_idx = offset + cur_idx; + for (int t = 0; t < TSIZE; t++) + { + const int cur_idx = threadIdx.x + blockDim.x * t; + const int item_idx = offset + cur_idx; + + if (item_idx < max_idx) + { + loc_bboxIndex[t] = beforeNMS_index_array[item_idx]; + + if (loc_bboxIndex[t] >= 0) + // if (loc_bboxIndex[t] != -1) + { + const int bbox_data_idx = share_location ? (loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset) : loc_bboxIndex[t]; + + loc_bbox[t].xmin = + flipXY ? bbox_data[bbox_data_idx * 4 + 1] : bbox_data[bbox_data_idx * 4 + 0]; + loc_bbox[t].ymin = + flipXY ? bbox_data[bbox_data_idx * 4 + 0] : bbox_data[bbox_data_idx * 4 + 1]; + loc_bbox[t].xmax = + flipXY ? bbox_data[bbox_data_idx * 4 + 3] : bbox_data[bbox_data_idx * 4 + 2]; + loc_bbox[t].ymax = + flipXY ? bbox_data[bbox_data_idx * 4 + 2] : bbox_data[bbox_data_idx * 4 + 3]; + kept_bboxinfo_flag[cur_idx] = true; + } + else + { + kept_bboxinfo_flag[cur_idx] = false; + } + } + else + { + kept_bboxinfo_flag[cur_idx] = false; + } + } - if (item_idx < max_idx) { - loc_bboxIndex[t] = beforeNMS_index_array[item_idx]; + // filter out overlapped boxes with lower scores + int ref_item_idx = offset; + int ref_bbox_idx = + share_location ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) : beforeNMS_index_array[ref_item_idx]; - if (loc_bboxIndex[t] >= 0) - // if (loc_bboxIndex[t] != -1) + while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) { - const int bbox_data_idx = share_location - ? (loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset) - : loc_bboxIndex[t]; - - loc_bbox[t].xmin = - flipXY ? bbox_data[bbox_data_idx * 4 + 1] : bbox_data[bbox_data_idx * 4 + 0]; - loc_bbox[t].ymin = - flipXY ? bbox_data[bbox_data_idx * 4 + 0] : bbox_data[bbox_data_idx * 4 + 1]; - loc_bbox[t].xmax = - flipXY ? bbox_data[bbox_data_idx * 4 + 3] : bbox_data[bbox_data_idx * 4 + 2]; - loc_bbox[t].ymax = - flipXY ? bbox_data[bbox_data_idx * 4 + 2] : bbox_data[bbox_data_idx * 4 + 3]; - kept_bboxinfo_flag[cur_idx] = true; - } else { - kept_bboxinfo_flag[cur_idx] = false; + Bbox ref_bbox; + ref_bbox.xmin = flipXY ? bbox_data[ref_bbox_idx * 4 + 1] : bbox_data[ref_bbox_idx * 4 + 0]; + ref_bbox.ymin = flipXY ? bbox_data[ref_bbox_idx * 4 + 0] : bbox_data[ref_bbox_idx * 4 + 1]; + ref_bbox.xmax = flipXY ? bbox_data[ref_bbox_idx * 4 + 3] : bbox_data[ref_bbox_idx * 4 + 2]; + ref_bbox.ymax = flipXY ? bbox_data[ref_bbox_idx * 4 + 2] : bbox_data[ref_bbox_idx * 4 + 3]; + + // Eliminate shared memory RAW hazard + __syncthreads(); + + for (int t = 0; t < TSIZE; t++) + { + const int cur_idx = threadIdx.x + blockDim.x * t; + const int item_idx = offset + cur_idx; + + if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) + { + // TODO: may need to add bool normalized as argument, HERE true means + // normalized + if (jaccardOverlap(ref_bbox, loc_bbox[t], isNormalized, T_BBOX(0)) > nms_threshold) + { + kept_bboxinfo_flag[cur_idx] = false; + } + } + } + __syncthreads(); + + do { + ref_item_idx++; + } while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]); + + ref_bbox_idx = + share_location ? 
(beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) : beforeNMS_index_array[ref_item_idx]; } - } else { - kept_bboxinfo_flag[cur_idx] = false; - } - } - // filter out overlapped boxes with lower scores - int ref_item_idx = offset; - int ref_bbox_idx = - share_location - ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) - : beforeNMS_index_array[ref_item_idx]; - - while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) { - Bbox ref_bbox; - ref_bbox.xmin = flipXY ? bbox_data[ref_bbox_idx * 4 + 1] : bbox_data[ref_bbox_idx * 4 + 0]; - ref_bbox.ymin = flipXY ? bbox_data[ref_bbox_idx * 4 + 0] : bbox_data[ref_bbox_idx * 4 + 1]; - ref_bbox.xmax = flipXY ? bbox_data[ref_bbox_idx * 4 + 3] : bbox_data[ref_bbox_idx * 4 + 2]; - ref_bbox.ymax = flipXY ? bbox_data[ref_bbox_idx * 4 + 2] : bbox_data[ref_bbox_idx * 4 + 3]; - - // Eliminate shared memory RAW hazard - __syncthreads(); - - for (int t = 0; t < TSIZE; t++) { - const int cur_idx = threadIdx.x + blockDim.x * t; - const int item_idx = offset + cur_idx; - - if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) { - // TODO: may need to add bool normalized as argument, HERE true means - // normalized - if (jaccardOverlap(ref_bbox, loc_bbox[t], isNormalized, T_BBOX(0)) > nms_threshold) { - kept_bboxinfo_flag[cur_idx] = false; - } + // store data + for (int t = 0; t < TSIZE; t++) + { + const int cur_idx = threadIdx.x + blockDim.x * t; + const int read_item_idx = offset + cur_idx; + const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx; + /* + * If not not keeping the bbox + * Set the score to 0 + * Set the bounding box index to -1 + */ + if (read_item_idx < max_idx) + { + afterNMS_scores[write_item_idx] = + kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f; + afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? loc_bboxIndex[t] : -1; + } } - } - __syncthreads(); - - do { - ref_item_idx++; - } while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]); - - ref_bbox_idx = - share_location - ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) - : beforeNMS_index_array[ref_item_idx]; } - - // store data - for (int t = 0; t < TSIZE; t++) { - const int cur_idx = threadIdx.x + blockDim.x * t; - const int read_item_idx = offset + cur_idx; - const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx; - /* - * If not not keeping the bbox - * Set the score to 0 - * Set the bounding box index to -1 - */ - if (read_item_idx < max_idx) { - afterNMS_scores[write_item_idx] = - kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f; - afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? 
loc_bboxIndex[t] : -1; - } - } - } } -template -pluginStatus_t allClassNMS_gpu(cudaStream_t stream, const int num, const int num_classes, - const int num_preds_per_class, const int top_k, - const float nms_threshold, const bool share_location, - const bool isNormalized, void *bbox_data, void *beforeNMS_scores, - void *beforeNMS_index_array, void *afterNMS_scores, - void *afterNMS_index_array, bool flipXY = false) { +template +pluginStatus_t allClassNMS_gpu(cudaStream_t stream, const int num, const int num_classes, const int num_preds_per_class, const int top_k, const float nms_threshold, const bool share_location, const bool isNormalized, void* bbox_data, void* beforeNMS_scores, void* beforeNMS_index_array, void* afterNMS_scores, void* afterNMS_index_array, bool flipXY = false) +{ #define P(tsize) allClassNMS_kernel - void (*kernel[10])(const int, const int, const int, const int, const float, const bool, - const bool, float *, T_SCORE *, int *, T_SCORE *, int *, bool) = { - P(1), P(2), P(3), P(4), P(5), P(6), P(7), P(8), P(9), P(10), - }; - - const int GS = num_classes; - const int t_size = (top_k + BS - 1) / BS; - - ASSERT(t_size <= 10); - kernel[t_size - 1]<<>>( - num, num_classes, num_preds_per_class, top_k, nms_threshold, share_location, isNormalized, - (T_BBOX *)bbox_data, (T_SCORE *)beforeNMS_scores, (int *)beforeNMS_index_array, - (T_SCORE *)afterNMS_scores, (int *)afterNMS_index_array, flipXY); - - cudaError_t code = cudaGetLastError(); - CUASSERT(code); - CSC(code, STATUS_FAILURE); - return STATUS_SUCCESS; + void (*kernel[10])(const int, const int, const int, const int, const float, const bool, const bool, float*, T_SCORE*, int*, T_SCORE*, int*, bool) = { + P(1), + P(2), + P(3), + P(4), + P(5), + P(6), + P(7), + P(8), + P(9), + P(10), + }; + + const int GS = num_classes; + const int t_size = (top_k + BS - 1) / BS; + + ASSERT(t_size <= 10); + kernel[t_size - 1]<<>>( + num, + num_classes, + num_preds_per_class, + top_k, + nms_threshold, + share_location, + isNormalized, + (T_BBOX*)bbox_data, + (T_SCORE*)beforeNMS_scores, + (int*)beforeNMS_index_array, + (T_SCORE*)afterNMS_scores, + (int*)afterNMS_index_array, + flipXY); + + cudaError_t code = cudaGetLastError(); + CUASSERT(code); + CSC(code, STATUS_FAILURE); + return STATUS_SUCCESS; } // allClassNMS LAUNCH CONFIG -typedef pluginStatus_t (*nmsFunc)(cudaStream_t, const int, const int, const int, const int, - const float, const bool, const bool, void *, void *, void *, - void *, void *, bool); - -struct nmsLaunchConfigSSD { - DataType t_score; - DataType t_bbox; - nmsFunc function; - - nmsLaunchConfigSSD(DataType t_score, DataType t_bbox) : t_score(t_score), t_bbox(t_bbox) {} - nmsLaunchConfigSSD(DataType t_score, DataType t_bbox, nmsFunc function) - : t_score(t_score), t_bbox(t_bbox), function(function) {} - bool operator==(const nmsLaunchConfigSSD &other) { - return t_score == other.t_score && t_bbox == other.t_bbox; - } +typedef pluginStatus_t (*nmsFunc)(cudaStream_t, const int, const int, const int, const int, const float, const bool, const bool, void*, void*, void*, void*, void*, bool); + +struct nmsLaunchConfigSSD +{ + DataType t_score; + DataType t_bbox; + nmsFunc function; + + nmsLaunchConfigSSD(DataType t_score, DataType t_bbox) + : t_score(t_score) + , t_bbox(t_bbox) + { + } + nmsLaunchConfigSSD(DataType t_score, DataType t_bbox, nmsFunc function) + : t_score(t_score) + , t_bbox(t_bbox) + , function(function) + { + } + bool operator==(const nmsLaunchConfigSSD& other) + { + return t_score == other.t_score && t_bbox == 
other.t_bbox; + } }; static std::vector nmsFuncVec; -bool nmsInit() { - nmsFuncVec.push_back( - nmsLaunchConfigSSD(DataType::kFLOAT, DataType::kFLOAT, allClassNMS_gpu)); - return true; +bool nmsInit() +{ + nmsFuncVec.push_back( + nmsLaunchConfigSSD(DataType::kFLOAT, DataType::kFLOAT, allClassNMS_gpu)); + return true; } -static bool initialized = nmsInit(); - -pluginStatus_t allClassNMS(cudaStream_t stream, const int num, const int num_classes, - const int num_preds_per_class, const int top_k, - const float nms_threshold, const bool share_location, - const bool isNormalized, const DataType DT_SCORE, const DataType DT_BBOX, - void *bbox_data, void *beforeNMS_scores, void *beforeNMS_index_array, - void *afterNMS_scores, void *afterNMS_index_array, bool flipXY) { - nmsLaunchConfigSSD lc(DT_SCORE, DT_BBOX); - for (unsigned i = 0; i < nmsFuncVec.size(); ++i) { - if (lc == nmsFuncVec[i]) { - DEBUG_PRINTF("all class nms kernel %d\n", i); - return nmsFuncVec[i].function(stream, num, num_classes, num_preds_per_class, top_k, - nms_threshold, share_location, isNormalized, bbox_data, - beforeNMS_scores, beforeNMS_index_array, afterNMS_scores, - afterNMS_index_array, flipXY); +static bool initialized = nmsInit(); + +pluginStatus_t allClassNMS(cudaStream_t stream, const int num, const int num_classes, const int num_preds_per_class, const int top_k, const float nms_threshold, const bool share_location, const bool isNormalized, const DataType DT_SCORE, const DataType DT_BBOX, void* bbox_data, void* beforeNMS_scores, void* beforeNMS_index_array, void* afterNMS_scores, void* afterNMS_index_array, bool flipXY) +{ + nmsLaunchConfigSSD lc(DT_SCORE, DT_BBOX); + for (unsigned i = 0; i < nmsFuncVec.size(); ++i) + { + if (lc == nmsFuncVec[i]) + { + DEBUG_PRINTF("all class nms kernel %d\n", i); + return nmsFuncVec[i].function(stream, num, num_classes, num_preds_per_class, top_k, nms_threshold, share_location, isNormalized, bbox_data, beforeNMS_scores, beforeNMS_index_array, afterNMS_scores, afterNMS_index_array, flipXY); + } } - } - return STATUS_BAD_PARAM; + return STATUS_BAD_PARAM; } diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassRotatedNMS.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassRotatedNMS.cu index 0edea2bfaf..52758ea247 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassRotatedNMS.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassRotatedNMS.cu @@ -6,490 +6,559 @@ #include "nms/kernel.h" -template -struct RotatedBox { - T x_ctr, y_ctr, w, h, a; +template +struct RotatedBox +{ + T x_ctr, y_ctr, w, h, a; }; -template -struct Point { - T x, y; - __host__ __device__ __forceinline__ Point(const T &px = 0, const T &py = 0) : x(px), y(py) {} - __host__ __device__ __forceinline__ Point operator+(const Point &p) const { - return Point(x + p.x, y + p.y); - } - __host__ __device__ __forceinline__ Point &operator+=(const Point &p) { - x += p.x; - y += p.y; - return *this; - } - __host__ __device__ __forceinline__ Point operator-(const Point &p) const { - return Point(x - p.x, y - p.y); - } - __host__ __device__ __forceinline__ Point operator*(const T coeff) const { - return Point(x * coeff, y * coeff); - } +template +struct Point +{ + T x, y; + __host__ __device__ __forceinline__ Point(const T& px = 0, const T& py = 0) + : x(px) + , y(py) + { + } + __host__ __device__ __forceinline__ Point operator+(const Point& p) const + { + return Point(x + p.x, y + p.y); + } + __host__ __device__ __forceinline__ Point& operator+=(const Point& p) + { + 
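+        // component-wise accumulate; together with dot_2d / cross_2d below, these
+        // small 2D vector ops drive the polygon clipping used for rotated-box IoU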
x += p.x; + y += p.y; + return *this; + } + __host__ __device__ __forceinline__ Point operator-(const Point& p) const + { + return Point(x - p.x, y - p.y); + } + __host__ __device__ __forceinline__ Point operator*(const T coeff) const + { + return Point(x * coeff, y * coeff); + } }; -template -__host__ __device__ __forceinline__ T dot_2d(const Point &A, const Point &B) { - return A.x * B.x + A.y * B.y; +template +__host__ __device__ __forceinline__ T dot_2d(const Point& A, const Point& B) +{ + return A.x * B.x + A.y * B.y; } -template -__host__ __device__ __forceinline__ T cross_2d(const Point &A, const Point &B) { - return A.x * B.y - B.x * A.y; +template +__host__ __device__ __forceinline__ T cross_2d(const Point& A, const Point& B) +{ + return A.x * B.y - B.x * A.y; } -template -__host__ __device__ __forceinline__ void get_rotated_vertices(const RotatedBox &box, - Point (&pts)[4]) { - // M_PI / 180. == 0.01745329251 - // double theta = box.a * 0.01745329251; - // MODIFIED - double theta = box.a; - T cosTheta2 = (T)cos(theta) * 0.5f; - T sinTheta2 = (T)sin(theta) * 0.5f; - - // y: top --> down; x: left --> right - pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; - pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; - pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; - pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; - pts[2].x = 2 * box.x_ctr - pts[0].x; - pts[2].y = 2 * box.y_ctr - pts[0].y; - pts[3].x = 2 * box.x_ctr - pts[1].x; - pts[3].y = 2 * box.y_ctr - pts[1].y; +template +__host__ __device__ __forceinline__ void get_rotated_vertices(const RotatedBox& box, + Point (&pts)[4]) +{ + // M_PI / 180. == 0.01745329251 + // double theta = box.a * 0.01745329251; + // MODIFIED + double theta = box.a; + T cosTheta2 = (T)cos(theta) * 0.5f; + T sinTheta2 = (T)sin(theta) * 0.5f; + + // y: top --> down; x: left --> right + pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; + pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; + pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; + pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; + pts[2].x = 2 * box.x_ctr - pts[0].x; + pts[2].y = 2 * box.y_ctr - pts[0].y; + pts[3].x = 2 * box.x_ctr - pts[1].x; + pts[3].y = 2 * box.y_ctr - pts[1].y; } -template +template __host__ __device__ __forceinline__ int get_intersection_points(const Point (&pts1)[4], const Point (&pts2)[4], - Point (&intersections)[24]) { - // Line vector - // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] - Point vec1[4], vec2[4]; - for (int i = 0; i < 4; i++) { - vec1[i] = pts1[(i + 1) % 4] - pts1[i]; - vec2[i] = pts2[(i + 1) % 4] - pts2[i]; - } - - // Line test - test all line combos for intersection - int num = 0; // number of intersections - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - // Solve for 2x2 Ax=b - T det = cross_2d(vec2[j], vec1[i]); - - // This takes care of parallel lines - if (fabs(det) <= 1e-14) { - continue; - } - - auto vec12 = pts2[j] - pts1[i]; - - T t1 = cross_2d(vec2[j], vec12) / det; - T t2 = cross_2d(vec1[i], vec12) / det; - - if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { - intersections[num++] = pts1[i] + vec1[i] * t1; - } + Point (&intersections)[24]) +{ + // Line vector + // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] + Point vec1[4], vec2[4]; + for (int i = 0; i < 4; i++) + { + vec1[i] = pts1[(i + 1) % 4] - pts1[i]; + vec2[i] = pts2[(i + 1) % 4] - pts2[i]; } - } - - // Check for vertices of rect1 inside rect2 - { - const auto 
&AB = vec2[0]; - const auto &DA = vec2[3]; - auto ABdotAB = dot_2d(AB, AB); - auto ADdotAD = dot_2d(DA, DA); - for (int i = 0; i < 4; i++) { - // assume ABCD is the rectangle, and P is the point to be judged - // P is inside ABCD iff. P's projection on AB lies within AB - // and P's projection on AD lies within AD - - auto AP = pts1[i] - pts2[0]; - - auto APdotAB = dot_2d(AP, AB); - auto APdotAD = -dot_2d(AP, DA); - - if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { - intersections[num++] = pts1[i]; - } + + // Line test - test all line combos for intersection + int num = 0; // number of intersections + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + // Solve for 2x2 Ax=b + T det = cross_2d(vec2[j], vec1[i]); + + // This takes care of parallel lines + if (fabs(det) <= 1e-14) + { + continue; + } + + auto vec12 = pts2[j] - pts1[i]; + + T t1 = cross_2d(vec2[j], vec12) / det; + T t2 = cross_2d(vec1[i], vec12) / det; + + if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) + { + intersections[num++] = pts1[i] + vec1[i] * t1; + } + } } - } - - // Reverse the check - check for vertices of rect2 inside rect1 - { - const auto &AB = vec1[0]; - const auto &DA = vec1[3]; - auto ABdotAB = dot_2d(AB, AB); - auto ADdotAD = dot_2d(DA, DA); - for (int i = 0; i < 4; i++) { - auto AP = pts2[i] - pts1[0]; - - auto APdotAB = dot_2d(AP, AB); - auto APdotAD = -dot_2d(AP, DA); - - if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { - intersections[num++] = pts2[i]; - } + + // Check for vertices of rect1 inside rect2 + { + const auto& AB = vec2[0]; + const auto& DA = vec2[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) + { + // assume ABCD is the rectangle, and P is the point to be judged + // P is inside ABCD iff. P's projection on AB lies within AB + // and P's projection on AD lies within AD + + auto AP = pts1[i] - pts2[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) + { + intersections[num++] = pts1[i]; + } + } + } + + // Reverse the check - check for vertices of rect2 inside rect1 + { + const auto& AB = vec1[0]; + const auto& DA = vec1[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) + { + auto AP = pts2[i] - pts1[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) + { + intersections[num++] = pts2[i]; + } + } } - } - return num; + return num; } -template +template __host__ __device__ __forceinline__ int convex_hull_graham(const Point (&p)[24], - const int &num_in, Point (&q)[24], - bool shift_to_zero = false) { - assert(num_in >= 2); - - // Step 1: - // Find point with minimum y - // if more than 1 points have the same minimum y, - // pick the one with the minimum x. - int t = 0; - for (int i = 1; i < num_in; i++) { - if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { - t = i; + const int& num_in, + Point (&q)[24], + bool shift_to_zero = false) +{ + assert(num_in >= 2); + + // Step 1: + // Find point with minimum y + // if more than 1 points have the same minimum y, + // pick the one with the minimum x. 
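+    // (Graham-scan invariant: the bottom-most, then left-most, point is always
+    // a vertex of the convex hull, so it is a safe pivot for the angular sort
+    // performed in Step 3 below.)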
+ int t = 0; + for (int i = 1; i < num_in; i++) + { + if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) + { + t = i; + } + } + auto& start = p[t]; // starting point + + // Step 2: + // Subtract starting point from every points (for sorting in the next step) + for (int i = 0; i < num_in; i++) + { + q[i] = p[i] - start; + } + + // Swap the starting point to position 0 + auto tmp = q[0]; + q[0] = q[t]; + q[t] = tmp; + + // Step 3: + // Sort point 1 ~ num_in according to their relative cross-product values + // (essentially sorting according to angles) + // If the angles are the same, sort according to their distance to origin + T dist[24]; + for (int i = 0; i < num_in; i++) + { + dist[i] = dot_2d(q[i], q[i]); + } + + for (int i = 1; i < num_in - 1; i++) + { + for (int j = i + 1; j < num_in; j++) + { + T crossProduct = cross_2d(q[i], q[j]); + if ((crossProduct < -1e-6) || (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) + { + auto q_tmp = q[i]; + q[i] = q[j]; + q[j] = q_tmp; + auto dist_tmp = dist[i]; + dist[i] = dist[j]; + dist[j] = dist_tmp; + } + } } - } - auto &start = p[t]; // starting point - - // Step 2: - // Subtract starting point from every points (for sorting in the next step) - for (int i = 0; i < num_in; i++) { - q[i] = p[i] - start; - } - - // Swap the starting point to position 0 - auto tmp = q[0]; - q[0] = q[t]; - q[t] = tmp; - - // Step 3: - // Sort point 1 ~ num_in according to their relative cross-product values - // (essentially sorting according to angles) - // If the angles are the same, sort according to their distance to origin - T dist[24]; - for (int i = 0; i < num_in; i++) { - dist[i] = dot_2d(q[i], q[i]); - } - - for (int i = 1; i < num_in - 1; i++) { - for (int j = i + 1; j < num_in; j++) { - T crossProduct = cross_2d(q[i], q[j]); - if ((crossProduct < -1e-6) || (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { - auto q_tmp = q[i]; - q[i] = q[j]; - q[j] = q_tmp; - auto dist_tmp = dist[i]; - dist[i] = dist[j]; - dist[j] = dist_tmp; - } + + // Step 4: + // Make sure there are at least 2 points (that don't overlap with each other) + // in the stack + int k; // index of the non-overlapped second point + for (k = 1; k < num_in; k++) + { + if (dist[k] > 1e-8) + { + break; + } } - } - - // Step 4: - // Make sure there are at least 2 points (that don't overlap with each other) - // in the stack - int k; // index of the non-overlapped second point - for (k = 1; k < num_in; k++) { - if (dist[k] > 1e-8) { - break; + if (k == num_in) + { + // We reach the end, which means the convex hull is just one point + q[0] = p[t]; + return 1; } - } - if (k == num_in) { - // We reach the end, which means the convex hull is just one point - q[0] = p[t]; - return 1; - } - q[1] = q[k]; - int m = 2; // 2 points in the stack - // Step 5: - // Finally we can start the scanning process. - // When a non-convex relationship between the 3 points is found - // (either concave shape or duplicated points), - // we pop the previous point from the stack - // until the 3-point relationship is convex again, or - // until the stack only contains two points - for (int i = k + 1; i < num_in; i++) { - while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { - m--; + q[1] = q[k]; + int m = 2; // 2 points in the stack + // Step 5: + // Finally we can start the scanning process. 
+ // When a non-convex relationship between the 3 points is found + // (either concave shape or duplicated points), + // we pop the previous point from the stack + // until the 3-point relationship is convex again, or + // until the stack only contains two points + for (int i = k + 1; i < num_in; i++) + { + while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) + { + m--; + } + q[m++] = q[i]; } - q[m++] = q[i]; - } - - // Step 6 (Optional): - // In general sense we need the original coordinates, so we - // need to shift the points back (reverting Step 2) - // But if we're only interested in getting the area/perimeter of the shape - // We can simply return. - if (!shift_to_zero) { - for (int i = 0; i < m; i++) { - q[i] += start; + + // Step 6 (Optional): + // In general sense we need the original coordinates, so we + // need to shift the points back (reverting Step 2) + // But if we're only interested in getting the area/perimeter of the shape + // We can simply return. + if (!shift_to_zero) + { + for (int i = 0; i < m; i++) + { + q[i] += start; + } } - } - return m; + return m; } -template -__host__ __device__ __forceinline__ T polygon_area(const Point (&q)[24], const int &m) { - if (m <= 2) { - return 0; - } +template +__host__ __device__ __forceinline__ T polygon_area(const Point (&q)[24], const int& m) +{ + if (m <= 2) + { + return 0; + } - T area = 0; - for (int i = 1; i < m - 1; i++) { - area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); - } + T area = 0; + for (int i = 1; i < m - 1; i++) + { + area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); + } - return area / 2.0; + return area / 2.0; } -template -__host__ __device__ __forceinline__ T rotated_boxes_intersection(const RotatedBox &box1, - const RotatedBox &box2) { - // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned - // from rotated_rect_intersection_pts - Point intersectPts[24], orderedPts[24]; +template +__host__ __device__ __forceinline__ T rotated_boxes_intersection(const RotatedBox& box1, + const RotatedBox& box2) +{ + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point intersectPts[24], orderedPts[24]; - Point pts1[4]; - Point pts2[4]; - get_rotated_vertices(box1, pts1); - get_rotated_vertices(box2, pts2); + Point pts1[4]; + Point pts2[4]; + get_rotated_vertices(box1, pts1); + get_rotated_vertices(box2, pts2); - int num = get_intersection_points(pts1, pts2, intersectPts); + int num = get_intersection_points(pts1, pts2, intersectPts); - if (num <= 2) { - return 0.0; - } + if (num <= 2) + { + return 0.0; + } - // Convex Hull to order the intersection points in clockwise order and find - // the contour area. - int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); - return polygon_area(orderedPts, num_convex); + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. 
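+    // Note: shift_to_zero is passed as true because only the intersection area
+    // is needed here, so the hull vertices may stay in the pivot-relative frame
+    // (see Step 6 above).  single_box_iou_rotated() then divides this area by
+    // (area1 + area2 - intersection) to obtain the IoU.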
+ int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); + return polygon_area(orderedPts, num_convex); } -template -__host__ __device__ __forceinline__ T single_box_iou_rotated(T const *const box1_raw, - T const *const box2_raw) { - // shift center to the middle point to achieve higher precision in result - RotatedBox box1, box2; - auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; - auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; - box1.x_ctr = box1_raw[0] - center_shift_x; - box1.y_ctr = box1_raw[1] - center_shift_y; - box1.w = box1_raw[2]; - box1.h = box1_raw[3]; - box1.a = box1_raw[4]; - box2.x_ctr = box2_raw[0] - center_shift_x; - box2.y_ctr = box2_raw[1] - center_shift_y; - box2.w = box2_raw[2]; - box2.h = box2_raw[3]; - box2.a = box2_raw[4]; - - const T area1 = box1.w * box1.h; - const T area2 = box2.w * box2.h; - if (area1 < 1e-14 || area2 < 1e-14) { - return 1.0f; - } - - const T intersection = rotated_boxes_intersection(box1, box2); - T baseS = 1.0; - baseS = (area1 + area2 - intersection); - const T iou = intersection / baseS; - return iou; +template +__host__ __device__ __forceinline__ T single_box_iou_rotated(T const* const box1_raw, + T const* const box2_raw) +{ + // shift center to the middle point to achieve higher precision in result + RotatedBox box1, box2; + auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; + auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; + box1.x_ctr = box1_raw[0] - center_shift_x; + box1.y_ctr = box1_raw[1] - center_shift_y; + box1.w = box1_raw[2]; + box1.h = box1_raw[3]; + box1.a = box1_raw[4]; + box2.x_ctr = box2_raw[0] - center_shift_x; + box2.y_ctr = box2_raw[1] - center_shift_y; + box2.w = box2_raw[2]; + box2.h = box2_raw[3]; + box2.a = box2_raw[4]; + + const T area1 = box1.w * box1.h; + const T area2 = box2.w * box2.h; + if (area1 < 1e-14 || area2 < 1e-14) + { + return 1.0f; + } + + const T intersection = rotated_boxes_intersection(box1, box2); + T baseS = 1.0; + baseS = (area1 + area2 - intersection); + const T iou = intersection / baseS; + return iou; } /********** new NMS for only score and index array **********/ -template -__global__ void allClassRotatedNMS_kernel(const int num, const int num_classes, - const int num_preds_per_class, const int top_k, - const float nms_threshold, const bool share_location, - const bool isNormalized, - T_BBOX *bbox_data, // bbox_data should be float to - // preserve location information - T_SCORE *beforeNMS_scores, int *beforeNMS_index_array, - T_SCORE *afterNMS_scores, int *afterNMS_index_array) { - //__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE]; - extern __shared__ bool kept_bboxinfo_flag[]; - for (int i = 0; i < num; i++) { - const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class; - const int max_idx = offset + top_k; // put top_k bboxes into NMS calculation - const int bbox_idx_offset = - share_location ? 
(i * num_preds_per_class) : (i * num_classes * num_preds_per_class); - - // local thread data - int loc_bboxIndex[TSIZE]; - T_BBOX loc_bbox[TSIZE * 5]; - - // initialize Bbox, Bboxinfo, kept_bboxinfo_flag - // Eliminate shared memory RAW hazard - __syncthreads(); +template +__global__ void allClassRotatedNMS_kernel(const int num, const int num_classes, const int num_preds_per_class, const int top_k, const float nms_threshold, const bool share_location, const bool isNormalized, + T_BBOX* bbox_data, // bbox_data should be float to + // preserve location information + T_SCORE* beforeNMS_scores, + int* beforeNMS_index_array, + T_SCORE* afterNMS_scores, + int* afterNMS_index_array) +{ + //__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE]; + extern __shared__ bool kept_bboxinfo_flag[]; + for (int i = 0; i < num; i++) + { + const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class; + const int max_idx = offset + top_k; // put top_k bboxes into NMS calculation + const int bbox_idx_offset = + share_location ? (i * num_preds_per_class) : (i * num_classes * num_preds_per_class); + + // local thread data + int loc_bboxIndex[TSIZE]; + T_BBOX loc_bbox[TSIZE * 5]; + + // initialize Bbox, Bboxinfo, kept_bboxinfo_flag + // Eliminate shared memory RAW hazard + __syncthreads(); #pragma unroll - for (int t = 0; t < TSIZE; t++) { - const int cur_idx = threadIdx.x + blockDim.x * t; - const int item_idx = offset + cur_idx; + for (int t = 0; t < TSIZE; t++) + { + const int cur_idx = threadIdx.x + blockDim.x * t; + const int item_idx = offset + cur_idx; + + if (item_idx < max_idx) + { + loc_bboxIndex[t] = beforeNMS_index_array[item_idx]; + + if (loc_bboxIndex[t] >= 0) + // if (loc_bboxIndex[t] != -1) + { + const int bbox_data_idx = share_location ? (loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset) : loc_bboxIndex[t]; + memcpy(&loc_bbox[t * 5], &bbox_data[bbox_data_idx * 5], 5 * sizeof(T_BBOX)); + kept_bboxinfo_flag[cur_idx] = true; + } + else + { + kept_bboxinfo_flag[cur_idx] = false; + } + } + else + { + kept_bboxinfo_flag[cur_idx] = false; + } + } - if (item_idx < max_idx) { - loc_bboxIndex[t] = beforeNMS_index_array[item_idx]; + // filter out overlapped boxes with lower scores + int ref_item_idx = offset; + int ref_bbox_idx = + share_location ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) : beforeNMS_index_array[ref_item_idx]; - if (loc_bboxIndex[t] >= 0) - // if (loc_bboxIndex[t] != -1) + while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) { - const int bbox_data_idx = share_location - ? 
(loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset) - : loc_bboxIndex[t]; - memcpy(&loc_bbox[t * 5], &bbox_data[bbox_data_idx * 5], 5 * sizeof(T_BBOX)); - kept_bboxinfo_flag[cur_idx] = true; - } else { - kept_bboxinfo_flag[cur_idx] = false; + T_BBOX ref_bbox[5]; + memcpy(&ref_bbox[0], &bbox_data[ref_bbox_idx * 5], 5 * sizeof(T_BBOX)); + + // Eliminate shared memory RAW hazard + __syncthreads(); + + for (int t = 0; t < TSIZE; t++) + { + const int cur_idx = threadIdx.x + blockDim.x * t; + const int item_idx = offset + cur_idx; + + if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) + { + // TODO: may need to add bool normalized as argument, HERE true means + // normalized + if (single_box_iou_rotated(&ref_bbox[0], loc_bbox + t * 5) > nms_threshold) + { + kept_bboxinfo_flag[cur_idx] = false; + } + } + } + __syncthreads(); + + do { + ref_item_idx++; + } while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]); + + ref_bbox_idx = + share_location ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) : beforeNMS_index_array[ref_item_idx]; } - } else { - kept_bboxinfo_flag[cur_idx] = false; - } - } - // filter out overlapped boxes with lower scores - int ref_item_idx = offset; - int ref_bbox_idx = - share_location - ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) - : beforeNMS_index_array[ref_item_idx]; - - while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) { - T_BBOX ref_bbox[5]; - memcpy(&ref_bbox[0], &bbox_data[ref_bbox_idx * 5], 5 * sizeof(T_BBOX)); - - // Eliminate shared memory RAW hazard - __syncthreads(); - - for (int t = 0; t < TSIZE; t++) { - const int cur_idx = threadIdx.x + blockDim.x * t; - const int item_idx = offset + cur_idx; - - if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) { - // TODO: may need to add bool normalized as argument, HERE true means - // normalized - if (single_box_iou_rotated(&ref_bbox[0], loc_bbox + t * 5) > nms_threshold) { - kept_bboxinfo_flag[cur_idx] = false; - } + // store data + for (int t = 0; t < TSIZE; t++) + { + const int cur_idx = threadIdx.x + blockDim.x * t; + const int read_item_idx = offset + cur_idx; + const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx; + /* + * If not not keeping the bbox + * Set the score to 0 + * Set the bounding box index to -1 + */ + if (read_item_idx < max_idx) + { + afterNMS_scores[write_item_idx] = + kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f; + afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? loc_bboxIndex[t] : -1; + } } - } - __syncthreads(); - - do { - ref_item_idx++; - } while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]); - - ref_bbox_idx = - share_location - ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) - : beforeNMS_index_array[ref_item_idx]; - } - - // store data - for (int t = 0; t < TSIZE; t++) { - const int cur_idx = threadIdx.x + blockDim.x * t; - const int read_item_idx = offset + cur_idx; - const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx; - /* - * If not not keeping the bbox - * Set the score to 0 - * Set the bounding box index to -1 - */ - if (read_item_idx < max_idx) { - afterNMS_scores[write_item_idx] = - kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f; - afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? 
loc_bboxIndex[t] : -1; - } } - } } -template -pluginStatus_t allClassRotatedNMS_gpu(cudaStream_t stream, const int num, const int num_classes, - const int num_preds_per_class, const int top_k, - const float nms_threshold, const bool share_location, - const bool isNormalized, void *bbox_data, - void *beforeNMS_scores, void *beforeNMS_index_array, - void *afterNMS_scores, void *afterNMS_index_array) { +template +pluginStatus_t allClassRotatedNMS_gpu(cudaStream_t stream, const int num, const int num_classes, const int num_preds_per_class, const int top_k, const float nms_threshold, const bool share_location, const bool isNormalized, void* bbox_data, void* beforeNMS_scores, void* beforeNMS_index_array, void* afterNMS_scores, void* afterNMS_index_array) +{ #define P(tsize) allClassRotatedNMS_kernel - void (*kernel[10])(const int, const int, const int, const int, const float, const bool, - const bool, float *, T_SCORE *, int *, T_SCORE *, int *) = { - P(1), P(2), P(3), P(4), P(5), P(6), P(7), P(8), P(9), P(10), - }; - - const int BS = 512; - const int GS = num_classes; - const int t_size = (top_k + BS - 1) / BS; - - ASSERT(t_size <= 10); - kernel[t_size - 1]<<>>( - num, num_classes, num_preds_per_class, top_k, nms_threshold, share_location, isNormalized, - (T_BBOX *)bbox_data, (T_SCORE *)beforeNMS_scores, (int *)beforeNMS_index_array, - (T_SCORE *)afterNMS_scores, (int *)afterNMS_index_array); - - CSC(cudaGetLastError(), STATUS_FAILURE); - return STATUS_SUCCESS; + void (*kernel[10])(const int, const int, const int, const int, const float, const bool, const bool, float*, T_SCORE*, int*, T_SCORE*, int*) = { + P(1), + P(2), + P(3), + P(4), + P(5), + P(6), + P(7), + P(8), + P(9), + P(10), + }; + + const int BS = 512; + const int GS = num_classes; + const int t_size = (top_k + BS - 1) / BS; + + ASSERT(t_size <= 10); + kernel[t_size - 1]<<>>( + num, + num_classes, + num_preds_per_class, + top_k, + nms_threshold, + share_location, + isNormalized, + (T_BBOX*)bbox_data, + (T_SCORE*)beforeNMS_scores, + (int*)beforeNMS_index_array, + (T_SCORE*)afterNMS_scores, + (int*)afterNMS_index_array); + + CSC(cudaGetLastError(), STATUS_FAILURE); + return STATUS_SUCCESS; } // allClassNMS LAUNCH CONFIG -typedef pluginStatus_t (*rotatedNmsFunc)(cudaStream_t, const int, const int, const int, const int, - const float, const bool, const bool, void *, void *, - void *, void *, void *); - -struct rotatedNmsLaunchConfig { - DataType t_score; - DataType t_bbox; - rotatedNmsFunc function; - - rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox) : t_score(t_score), t_bbox(t_bbox) {} - rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox, rotatedNmsFunc function) - : t_score(t_score), t_bbox(t_bbox), function(function) {} - bool operator==(const rotatedNmsLaunchConfig &other) { - return t_score == other.t_score && t_bbox == other.t_bbox; - } +typedef pluginStatus_t (*rotatedNmsFunc)(cudaStream_t, const int, const int, const int, const int, const float, const bool, const bool, void*, void*, void*, void*, void*); + +struct rotatedNmsLaunchConfig +{ + DataType t_score; + DataType t_bbox; + rotatedNmsFunc function; + + rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox) + : t_score(t_score) + , t_bbox(t_bbox) + { + } + rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox, rotatedNmsFunc function) + : t_score(t_score) + , t_bbox(t_bbox) + , function(function) + { + } + bool operator==(const rotatedNmsLaunchConfig& other) + { + return t_score == other.t_score && t_bbox == other.t_bbox; + } }; static std::vector 
rotatedNmsFuncVec;
-bool rotatedNmsInit() {
-  rotatedNmsFuncVec.push_back(rotatedNmsLaunchConfig(DataType::kFLOAT, DataType::kFLOAT,
-                                                     allClassRotatedNMS_gpu));
-  return true;
+bool rotatedNmsInit()
+{
+    rotatedNmsFuncVec.push_back(rotatedNmsLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, allClassRotatedNMS_gpu));
+    return true;
 }
-static bool initialized = rotatedNmsInit();
-
-pluginStatus_t allClassRotatedNMS(cudaStream_t stream, const int num, const int num_classes,
-                                  const int num_preds_per_class, const int top_k,
-                                  const float nms_threshold, const bool share_location,
-                                  const bool isNormalized, const DataType DT_SCORE,
-                                  const DataType DT_BBOX, void *bbox_data, void *beforeNMS_scores,
-                                  void *beforeNMS_index_array, void *afterNMS_scores,
-                                  void *afterNMS_index_array, bool) {
-  auto __cuda_arch__ = get_cuda_arch(0);  // assume there is only one arch 7.2 device
-  if (__cuda_arch__ == 720 && top_k >= 1000) {
-    printf("Warning: pre_top_k need to be reduced for devices with arch 7.2, got pre_top_k=%d\n",
-           top_k);
-  }
-  rotatedNmsLaunchConfig lc(DT_SCORE, DT_BBOX);
-
-  for (unsigned i = 0; i < rotatedNmsFuncVec.size(); ++i) {
-    if (lc == rotatedNmsFuncVec[i]) {
-      DEBUG_PRINTF("all class rotated nms kernel %d\n", i);
-      return rotatedNmsFuncVec[i].function(stream, num, num_classes, num_preds_per_class, top_k,
-                                           nms_threshold, share_location, isNormalized, bbox_data,
-                                           beforeNMS_scores, beforeNMS_index_array, afterNMS_scores,
-                                           afterNMS_index_array);
+static bool initialized = rotatedNmsInit();
+
+pluginStatus_t allClassRotatedNMS(cudaStream_t stream, const int num, const int num_classes, const int num_preds_per_class, const int top_k, const float nms_threshold, const bool share_location, const bool isNormalized, const DataType DT_SCORE, const DataType DT_BBOX, void* bbox_data, void* beforeNMS_scores, void* beforeNMS_index_array, void* afterNMS_scores, void* afterNMS_index_array, bool)
+{
+    auto __cuda_arch__ = get_cuda_arch(0);  // assume there is only one arch 7.2 device
+    if (__cuda_arch__ == 720 && top_k >= 1000)
+    {
+        printf("Warning: pre_top_k needs to be reduced for devices with arch 7.2, got pre_top_k=%d\n",
+               top_k);
+    }
+    rotatedNmsLaunchConfig lc(DT_SCORE, DT_BBOX);
+
+    for (unsigned i = 0; i < rotatedNmsFuncVec.size(); ++i)
+    {
+        if (lc == rotatedNmsFuncVec[i])
+        {
+            DEBUG_PRINTF("all class rotated nms kernel %d\n", i);
+            return rotatedNmsFuncVec[i].function(stream, num, num_classes, num_preds_per_class, top_k, nms_threshold, share_location, isNormalized, bbox_data, beforeNMS_scores, beforeNMS_index_array, afterNMS_scores, afterNMS_index_array);
+        }
    }
-  }
-  return STATUS_BAD_PARAM;
+    return STATUS_BAD_PARAM;
 }
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/batched_nms_kernel.cpp b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/batched_nms_kernel.cpp
index 71cb7a8592..903624d86b 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/batched_nms_kernel.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/batched_nms_kernel.cpp
@@ -3,123 +3,111 @@
 // https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
 #include "nms/batched_nms_kernel.hpp"
 
-pluginStatus_t nmsInference(cudaStream_t stream, const int N, const int perBatchBoxesSize,
-                            const int perBatchScoresSize, const bool shareLocation,
-                            const int backgroundLabelId, const int numPredsPerClass,
-                            const int numClasses, const int topK, const int keepTopK,
-                            const float scoreThreshold, const float iouThreshold,
-                            const DataType DT_BBOX, const void* locData, const DataType DT_SCORE,
-                            const void* confData, void* nmsedDets, void* nmsedLabels,
-                            void* nmsedIndex, void* workspace, bool isNormalized, bool confSigmoid,
-                            bool clipBoxes, bool rotated) {
-  const int topKVal = topK < 0 ? numPredsPerClass : topK;
-  const int keepTopKVal = keepTopK < 0 ? numPredsPerClass : keepTopK;
-  // locCount = batch_size * number_boxes_per_sample * 4
-  const int locCount = N * perBatchBoxesSize;
-  /*
-   * shareLocation
-   * Bounding box are shared among all classes, i.e., a bounding box could be
-   * classified as any candidate class. Otherwise Bounding box are designed for
-   * specific classes, i.e., a bounding box could be classified as one certain
-   * class or not (binary classification).
-   */
-  const int numLocClasses = shareLocation ? 1 : numClasses;
-
-  size_t bboxDataSize = detectionForwardBBoxDataSize(N, perBatchBoxesSize, DataType::kFLOAT);
-  void* bboxDataRaw = workspace;
-  cudaMemcpyAsync(bboxDataRaw, locData, bboxDataSize, cudaMemcpyDeviceToDevice, stream);
-  pluginStatus_t status;
-
-  /*
-   * bboxDataRaw format:
-   * [batch size, numPriors (per sample), numLocClasses, 4]
-   */
-  // float for now
-  void* bboxData;
-  size_t bboxPermuteSize =
-      detectionForwardBBoxPermuteSize(shareLocation, N, perBatchBoxesSize, DataType::kFLOAT);
-  void* bboxPermute = nextWorkspacePtr((int8_t*)bboxDataRaw, bboxDataSize);
-
-  /*
-   * After permutation, bboxData format:
-   * [batch_size, numLocClasses, numPriors (per sample) (numPredsPerClass), 4]
-   * This is equivalent to swapping axis
-   */
-  if (!shareLocation) {
-    status = permuteData(stream, locCount, numLocClasses, numPredsPerClass, rotated ? 5 : 4,
-                         DataType::kFLOAT, false, bboxDataRaw, bboxPermute);
+pluginStatus_t nmsInference(cudaStream_t stream, const int N, const int perBatchBoxesSize, const int perBatchScoresSize, const bool shareLocation, const int backgroundLabelId, const int numPredsPerClass, const int numClasses, const int topK, const int keepTopK, const float scoreThreshold, const float iouThreshold, const DataType DT_BBOX, const void* locData, const DataType DT_SCORE, const void* confData, void* nmsedDets, void* nmsedLabels, void* nmsedIndex, void* workspace, bool isNormalized, bool confSigmoid, bool clipBoxes, bool rotated)
+{
+    const int topKVal = topK < 0 ? numPredsPerClass : topK;
+    const int keepTopKVal = keepTopK < 0 ? numPredsPerClass : keepTopK;
+    // locCount = batch_size * number_boxes_per_sample * 4
+    const int locCount = N * perBatchBoxesSize;
+    /*
+     * shareLocation
+     * Bounding boxes are shared among all classes, i.e., a bounding box could be
+     * classified as any candidate class. Otherwise, bounding boxes are specific to
+     * certain classes, i.e., a bounding box could be classified as one certain
+     * class or not (binary classification).
+     */
+    const int numLocClasses = shareLocation ?
1 : numClasses; + + size_t bboxDataSize = detectionForwardBBoxDataSize(N, perBatchBoxesSize, DataType::kFLOAT); + void* bboxDataRaw = workspace; + cudaMemcpyAsync(bboxDataRaw, locData, bboxDataSize, cudaMemcpyDeviceToDevice, stream); + pluginStatus_t status; + + /* + * bboxDataRaw format: + * [batch size, numPriors (per sample), numLocClasses, 4] + */ + // float for now + void* bboxData; + size_t bboxPermuteSize = + detectionForwardBBoxPermuteSize(shareLocation, N, perBatchBoxesSize, DataType::kFLOAT); + void* bboxPermute = nextWorkspacePtr((int8_t*)bboxDataRaw, bboxDataSize); + + /* + * After permutation, bboxData format: + * [batch_size, numLocClasses, numPriors (per sample) (numPredsPerClass), 4] + * This is equivalent to swapping axis + */ + if (!shareLocation) + { + status = permuteData(stream, locCount, numLocClasses, numPredsPerClass, rotated ? 5 : 4, DataType::kFLOAT, false, bboxDataRaw, bboxPermute); + ASSERT_FAILURE(status == STATUS_SUCCESS); + bboxData = bboxPermute; + } + /* + * If shareLocation, numLocClasses = 1 + * No need to permute data on linear memory + */ + else + { + bboxData = bboxDataRaw; + } + + /* + * Conf data format + * [batch size, numPriors * param.numClasses, 1, 1] + */ + const int numScores = N * perBatchScoresSize; + size_t totalScoresSize = detectionForwardPreNMSSize(N, perBatchScoresSize); + void* scores = nextWorkspacePtr((int8_t*)bboxPermute, bboxPermuteSize); + + // need a conf_scores + /* + * After permutation, bboxData format: + * [batch_size, numClasses, numPredsPerClass, 1] + */ + status = permuteData(stream, numScores, numClasses, numPredsPerClass, 1, DataType::kFLOAT, confSigmoid, confData, scores); ASSERT_FAILURE(status == STATUS_SUCCESS); - bboxData = bboxPermute; - } - /* - * If shareLocation, numLocClasses = 1 - * No need to permute data on linear memory - */ - else { - bboxData = bboxDataRaw; - } - - /* - * Conf data format - * [batch size, numPriors * param.numClasses, 1, 1] - */ - const int numScores = N * perBatchScoresSize; - size_t totalScoresSize = detectionForwardPreNMSSize(N, perBatchScoresSize); - void* scores = nextWorkspacePtr((int8_t*)bboxPermute, bboxPermuteSize); - - // need a conf_scores - /* - * After permutation, bboxData format: - * [batch_size, numClasses, numPredsPerClass, 1] - */ - status = permuteData(stream, numScores, numClasses, numPredsPerClass, 1, DataType::kFLOAT, - confSigmoid, confData, scores); - ASSERT_FAILURE(status == STATUS_SUCCESS); - - size_t indicesSize = detectionForwardPreNMSSize(N, perBatchScoresSize); - void* indices = nextWorkspacePtr((int8_t*)scores, totalScoresSize); - - size_t postNMSScoresSize = detectionForwardPostNMSSize(N, numClasses, topKVal); - size_t postNMSIndicesSize = detectionForwardPostNMSSize(N, numClasses, topKVal); - void* postNMSScores = nextWorkspacePtr((int8_t*)indices, indicesSize); - void* postNMSIndices = nextWorkspacePtr((int8_t*)postNMSScores, postNMSScoresSize); - - void* sortingWorkspace = nextWorkspacePtr((int8_t*)postNMSIndices, postNMSIndicesSize); - // Sort the scores so that the following NMS could be applied. - - status = sortScoresPerClass(stream, N, numClasses, numPredsPerClass, backgroundLabelId, - scoreThreshold, DataType::kFLOAT, scores, indices, sortingWorkspace); - ASSERT_FAILURE(status == STATUS_SUCCESS); - - // This is set to true as the input bounding boxes are of the format [ymin, - // xmin, ymax, xmax]. 
The default implementation assumes [xmin, ymin, xmax, - // ymax] - bool flipXY = false; - // NMS - if (rotated) { - status = allClassRotatedNMS(stream, N, numClasses, numPredsPerClass, topKVal, iouThreshold, - shareLocation, isNormalized, DataType::kFLOAT, DataType::kFLOAT, - bboxData, scores, indices, postNMSScores, postNMSIndices, flipXY); - } else { - status = allClassNMS(stream, N, numClasses, numPredsPerClass, topKVal, iouThreshold, - shareLocation, isNormalized, DataType::kFLOAT, DataType::kFLOAT, bboxData, - scores, indices, postNMSScores, postNMSIndices, flipXY); - } - - ASSERT_FAILURE(status == STATUS_SUCCESS); - - // Sort the bounding boxes after NMS using scores - status = sortScoresPerImage(stream, N, numClasses * topKVal, DataType::kFLOAT, postNMSScores, - postNMSIndices, scores, indices, sortingWorkspace); - - ASSERT_FAILURE(status == STATUS_SUCCESS); - - // Gather data from the sorted bounding boxes after NMS - status = gatherNMSOutputs(stream, shareLocation, N, numPredsPerClass, numClasses, topKVal, - keepTopKVal, DataType::kFLOAT, DataType::kFLOAT, indices, scores, - bboxData, nmsedDets, nmsedLabels, nmsedIndex, clipBoxes, rotated); - - ASSERT_FAILURE(status == STATUS_SUCCESS); - - return STATUS_SUCCESS; + + size_t indicesSize = detectionForwardPreNMSSize(N, perBatchScoresSize); + void* indices = nextWorkspacePtr((int8_t*)scores, totalScoresSize); + + size_t postNMSScoresSize = detectionForwardPostNMSSize(N, numClasses, topKVal); + size_t postNMSIndicesSize = detectionForwardPostNMSSize(N, numClasses, topKVal); + void* postNMSScores = nextWorkspacePtr((int8_t*)indices, indicesSize); + void* postNMSIndices = nextWorkspacePtr((int8_t*)postNMSScores, postNMSScoresSize); + + void* sortingWorkspace = nextWorkspacePtr((int8_t*)postNMSIndices, postNMSIndicesSize); + // Sort the scores so that the following NMS could be applied. + + status = sortScoresPerClass(stream, N, numClasses, numPredsPerClass, backgroundLabelId, scoreThreshold, DataType::kFLOAT, scores, indices, sortingWorkspace); + ASSERT_FAILURE(status == STATUS_SUCCESS); + + // This is set to true as the input bounding boxes are of the format [ymin, + // xmin, ymax, xmax]. 
The default implementation assumes [xmin, ymin, xmax, + // ymax] + bool flipXY = false; + // NMS + if (rotated) + { + status = allClassRotatedNMS(stream, N, numClasses, numPredsPerClass, topKVal, iouThreshold, shareLocation, isNormalized, DataType::kFLOAT, DataType::kFLOAT, bboxData, scores, indices, postNMSScores, postNMSIndices, flipXY); + } + else + { + status = allClassNMS(stream, N, numClasses, numPredsPerClass, topKVal, iouThreshold, shareLocation, isNormalized, DataType::kFLOAT, DataType::kFLOAT, bboxData, scores, indices, postNMSScores, postNMSIndices, flipXY); + } + + ASSERT_FAILURE(status == STATUS_SUCCESS); + + // Sort the bounding boxes after NMS using scores + status = sortScoresPerImage(stream, N, numClasses * topKVal, DataType::kFLOAT, postNMSScores, postNMSIndices, scores, indices, sortingWorkspace); + + ASSERT_FAILURE(status == STATUS_SUCCESS); + + // Gather data from the sorted bounding boxes after NMS + status = gatherNMSOutputs(stream, shareLocation, N, numPredsPerClass, numClasses, topKVal, keepTopKVal, DataType::kFLOAT, DataType::kFLOAT, indices, scores, bboxData, nmsedDets, nmsedLabels, nmsedIndex, clipBoxes, rotated); + + ASSERT_FAILURE(status == STATUS_SUCCESS); + + return STATUS_SUCCESS; } diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/gatherNMSOutputs.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/gatherNMSOutputs.cu index 58419f8c16..22d901565c 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/gatherNMSOutputs.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/gatherNMSOutputs.cu @@ -6,159 +6,170 @@ #include "nms/kernel.h" #include "trt_plugin_helper.hpp" -template +template __launch_bounds__(nthds_per_cta) __global__ - void gatherNMSOutputs_kernel(const bool shareLocation, const int numImages, - const int numPredsPerClass, const int numClasses, const int topK, - const int keepTopK, const int *indices, const T_SCORE *scores, - const T_BBOX *bboxData, T_BBOX *nmsedDets, int *nmsedLabels, - int *nmsedIndex, bool clipBoxes) { - if (keepTopK > topK) return; - for (int i = blockIdx.x * nthds_per_cta + threadIdx.x; i < numImages * keepTopK; - i += gridDim.x * nthds_per_cta) { - const int imgId = i / keepTopK; - const int detId = i % keepTopK; - const int offset = imgId * numClasses * topK; - const int index = indices[offset + detId]; - const T_SCORE score = scores[offset + detId]; - if (index == -1) { - nmsedLabels[i] = -1; - if (nmsedIndex != nullptr) { - nmsedIndex[i] = -1; - } - if (rotated) { - nmsedDets[i * 6] = 0; - nmsedDets[i * 6 + 1] = 0; - nmsedDets[i * 6 + 2] = 0; - nmsedDets[i * 6 + 3] = 0; - nmsedDets[i * 6 + 4] = 0; - nmsedDets[i * 6 + 5] = 0; - } else { - nmsedDets[i * 5] = 0; - nmsedDets[i * 5 + 1] = 0; - nmsedDets[i * 5 + 2] = 0; - nmsedDets[i * 5 + 3] = 0; - nmsedDets[i * 5 + 4] = 0; - } - } else { - const int bboxOffset = - imgId * (shareLocation ? numPredsPerClass : (numClasses * numPredsPerClass)); - nmsedLabels[i] = (index % (numClasses * numPredsPerClass)) / numPredsPerClass; // label - if (rotated) { - const int bboxId = ((shareLocation ? 
(index % numPredsPerClass) - : index % (numClasses * numPredsPerClass)) + - bboxOffset) * - 5; - if (nmsedIndex != nullptr) { - nmsedIndex[i] = bboxId / 5 - bboxOffset; + void gatherNMSOutputs_kernel(const bool shareLocation, const int numImages, const int numPredsPerClass, const int numClasses, const int topK, const int keepTopK, const int* indices, const T_SCORE* scores, const T_BBOX* bboxData, T_BBOX* nmsedDets, int* nmsedLabels, int* nmsedIndex, bool clipBoxes) +{ + if (keepTopK > topK) return; + for (int i = blockIdx.x * nthds_per_cta + threadIdx.x; i < numImages * keepTopK; + i += gridDim.x * nthds_per_cta) + { + const int imgId = i / keepTopK; + const int detId = i % keepTopK; + const int offset = imgId * numClasses * topK; + const int index = indices[offset + detId]; + const T_SCORE score = scores[offset + detId]; + if (index == -1) + { + nmsedLabels[i] = -1; + if (nmsedIndex != nullptr) + { + nmsedIndex[i] = -1; + } + if (rotated) + { + nmsedDets[i * 6] = 0; + nmsedDets[i * 6 + 1] = 0; + nmsedDets[i * 6 + 2] = 0; + nmsedDets[i * 6 + 3] = 0; + nmsedDets[i * 6 + 4] = 0; + nmsedDets[i * 6 + 5] = 0; + } + else + { + nmsedDets[i * 5] = 0; + nmsedDets[i * 5 + 1] = 0; + nmsedDets[i * 5 + 2] = 0; + nmsedDets[i * 5 + 3] = 0; + nmsedDets[i * 5 + 4] = 0; + } } - // clipped bbox xmin - nmsedDets[i * 6] = - clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId]; - // clipped bbox ymin - nmsedDets[i * 6 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 1]; - // clipped bbox xmax - nmsedDets[i * 6 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 2]; - // clipped bbox ymax - nmsedDets[i * 6 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 3]; - // clipped bbox angle - nmsedDets[i * 6 + 4] = clipBoxes ? max(min(bboxData[bboxId + 4], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 4]; - nmsedDets[i * 6 + 5] = score; - } else { - const int bboxId = ((shareLocation ? (index % numPredsPerClass) - : index % (numClasses * numPredsPerClass)) + - bboxOffset) * - 4; - if (nmsedIndex != nullptr) { - nmsedIndex[i] = bboxId / 4 - bboxOffset; + else + { + const int bboxOffset = + imgId * (shareLocation ? numPredsPerClass : (numClasses * numPredsPerClass)); + nmsedLabels[i] = (index % (numClasses * numPredsPerClass)) / numPredsPerClass; // label + if (rotated) + { + const int bboxId = ((shareLocation ? (index % numPredsPerClass) : index % (numClasses * numPredsPerClass)) + + bboxOffset) * + 5; + if (nmsedIndex != nullptr) + { + nmsedIndex[i] = bboxId / 5 - bboxOffset; + } + // clipped bbox xmin + nmsedDets[i * 6] = + clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId]; + // clipped bbox ymin + nmsedDets[i * 6 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 1]; + // clipped bbox xmax + nmsedDets[i * 6 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 2]; + // clipped bbox ymax + nmsedDets[i * 6 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 3]; + // clipped bbox angle + nmsedDets[i * 6 + 4] = clipBoxes ? max(min(bboxData[bboxId + 4], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 4]; + nmsedDets[i * 6 + 5] = score; + } + else + { + const int bboxId = ((shareLocation ? 
(index % numPredsPerClass) : index % (numClasses * numPredsPerClass)) + + bboxOffset) * + 4; + if (nmsedIndex != nullptr) + { + nmsedIndex[i] = bboxId / 4 - bboxOffset; + } + // clipped bbox xmin + nmsedDets[i * 5] = + clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId]; + // clipped bbox ymin + nmsedDets[i * 5 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 1]; + // clipped bbox xmax + nmsedDets[i * 5 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 2]; + // clipped bbox ymax + nmsedDets[i * 5 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 3]; + nmsedDets[i * 5 + 4] = score; + } } - // clipped bbox xmin - nmsedDets[i * 5] = - clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId]; - // clipped bbox ymin - nmsedDets[i * 5 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 1]; - // clipped bbox xmax - nmsedDets[i * 5 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 2]; - // clipped bbox ymax - nmsedDets[i * 5 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 3]; - nmsedDets[i * 5 + 4] = score; - } } - } } -template -pluginStatus_t gatherNMSOutputs_gpu(cudaStream_t stream, const bool shareLocation, - const int numImages, const int numPredsPerClass, - const int numClasses, const int topK, const int keepTopK, - const void *indices, const void *scores, const void *bboxData, - void *nmsedDets, void *nmsedLabels, void *nmsedIndex, - bool clipBoxes) { - const int BS = 32; - const int GS = 32; - gatherNMSOutputs_kernel<<>>( - shareLocation, numImages, numPredsPerClass, numClasses, topK, keepTopK, (int *)indices, - (T_SCORE *)scores, (T_BBOX *)bboxData, (T_BBOX *)nmsedDets, (int *)nmsedLabels, - (int *)nmsedIndex, clipBoxes); +template +pluginStatus_t gatherNMSOutputs_gpu(cudaStream_t stream, const bool shareLocation, const int numImages, const int numPredsPerClass, const int numClasses, const int topK, const int keepTopK, const void* indices, const void* scores, const void* bboxData, void* nmsedDets, void* nmsedLabels, void* nmsedIndex, bool clipBoxes) +{ + const int BS = 32; + const int GS = 32; + gatherNMSOutputs_kernel<<>>( + shareLocation, + numImages, + numPredsPerClass, + numClasses, + topK, + keepTopK, + (int*)indices, + (T_SCORE*)scores, + (T_BBOX*)bboxData, + (T_BBOX*)nmsedDets, + (int*)nmsedLabels, + (int*)nmsedIndex, + clipBoxes); - CSC(cudaGetLastError(), STATUS_FAILURE); - return STATUS_SUCCESS; + CSC(cudaGetLastError(), STATUS_FAILURE); + return STATUS_SUCCESS; } // gatherNMSOutputs LAUNCH CONFIG {{{ -typedef pluginStatus_t (*nmsOutFunc)(cudaStream_t, const bool, const int, const int, const int, - const int, const int, const void *, const void *, const void *, - void *, void *, void *, bool); -struct nmsOutLaunchConfig { - DataType t_bbox; - DataType t_score; - bool rotated; - nmsOutFunc function; +typedef pluginStatus_t (*nmsOutFunc)(cudaStream_t, const bool, const int, const int, const int, const int, const int, const void*, const void*, const void*, void*, void*, void*, bool); +struct nmsOutLaunchConfig +{ + DataType t_bbox; + DataType t_score; + bool rotated; + nmsOutFunc function; - nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated) - : t_bbox(t_bbox), t_score(t_score), rotated(rotated) {} - nmsOutLaunchConfig(DataType t_bbox, DataType 
t_score, bool rotated, nmsOutFunc function) - : t_bbox(t_bbox), t_score(t_score), rotated(rotated), function(function) {} - bool operator==(const nmsOutLaunchConfig &other) { - return t_bbox == other.t_bbox && t_score == other.t_score && rotated == other.rotated; - } + nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated) + : t_bbox(t_bbox) + , t_score(t_score) + , rotated(rotated) + { + } + nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated, nmsOutFunc function) + : t_bbox(t_bbox) + , t_score(t_score) + , rotated(rotated) + , function(function) + { + } + bool operator==(const nmsOutLaunchConfig& other) + { + return t_bbox == other.t_bbox && t_score == other.t_score && rotated == other.rotated; + } }; using nvinfer1::DataType; static std::vector nmsOutFuncVec; -bool nmsOutputInit() { - nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, false, - gatherNMSOutputs_gpu)); - nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, true, - gatherNMSOutputs_gpu)); - return true; +bool nmsOutputInit() +{ + nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, false, gatherNMSOutputs_gpu)); + nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, true, gatherNMSOutputs_gpu)); + return true; } -static bool initialized = nmsOutputInit(); +static bool initialized = nmsOutputInit(); -pluginStatus_t gatherNMSOutputs(cudaStream_t stream, const bool shareLocation, const int numImages, - const int numPredsPerClass, const int numClasses, const int topK, - const int keepTopK, const DataType DT_BBOX, const DataType DT_SCORE, - const void *indices, const void *scores, const void *bboxData, - void *nmsedDets, void *nmsedLabels, void *nmsedIndex, - bool clipBoxes, bool rotated) { - nmsOutLaunchConfig lc = nmsOutLaunchConfig(DT_BBOX, DT_SCORE, rotated); - for (unsigned i = 0; i < nmsOutFuncVec.size(); ++i) { - if (lc == nmsOutFuncVec[i]) { - DEBUG_PRINTF("gatherNMSOutputs kernel %d\n", i); - return nmsOutFuncVec[i].function(stream, shareLocation, numImages, numPredsPerClass, - numClasses, topK, keepTopK, indices, scores, bboxData, - nmsedDets, nmsedLabels, nmsedIndex, clipBoxes); +pluginStatus_t gatherNMSOutputs(cudaStream_t stream, const bool shareLocation, const int numImages, const int numPredsPerClass, const int numClasses, const int topK, const int keepTopK, const DataType DT_BBOX, const DataType DT_SCORE, const void* indices, const void* scores, const void* bboxData, void* nmsedDets, void* nmsedLabels, void* nmsedIndex, bool clipBoxes, bool rotated) +{ + nmsOutLaunchConfig lc = nmsOutLaunchConfig(DT_BBOX, DT_SCORE, rotated); + for (unsigned i = 0; i < nmsOutFuncVec.size(); ++i) + { + if (lc == nmsOutFuncVec[i]) + { + DEBUG_PRINTF("gatherNMSOutputs kernel %d\n", i); + return nmsOutFuncVec[i].function(stream, shareLocation, numImages, numPredsPerClass, numClasses, topK, keepTopK, indices, scores, bboxData, nmsedDets, nmsedLabels, nmsedIndex, clipBoxes); + } } - } - return STATUS_BAD_PARAM; + return STATUS_BAD_PARAM; } diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/kernel.cu index f0e1c9d0cc..e13f8969d4 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/kernel.cu @@ -12,96 +12,109 @@ #define CUDA_MEM_ALIGN 256 // return cuda arch -size_t get_cuda_arch(int devID) { - int computeMode = -1, major = 0, minor = 0; - 
CUASSERT(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); - CUASSERT(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); - CUASSERT(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); - return major * 100 + minor * 10; +size_t get_cuda_arch(int devID) +{ + int computeMode = -1, major = 0, minor = 0; + CUASSERT(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); + CUASSERT(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); + CUASSERT(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); + return major * 100 + minor * 10; } // ALIGNPTR -int8_t *alignPtr(int8_t *ptr, uintptr_t to) { - uintptr_t addr = (uintptr_t)ptr; - if (addr % to) { - addr += to - addr % to; - } - return (int8_t *)addr; +int8_t* alignPtr(int8_t* ptr, uintptr_t to) +{ + uintptr_t addr = (uintptr_t)ptr; + if (addr % to) + { + addr += to - addr % to; + } + return (int8_t*)addr; } // NEXTWORKSPACEPTR -int8_t *nextWorkspacePtr(int8_t *ptr, uintptr_t previousWorkspaceSize) { - uintptr_t addr = (uintptr_t)ptr; - addr += previousWorkspaceSize; - return alignPtr((int8_t *)addr, CUDA_MEM_ALIGN); +int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize) +{ + uintptr_t addr = (uintptr_t)ptr; + addr += previousWorkspaceSize; + return alignPtr((int8_t*)addr, CUDA_MEM_ALIGN); } // CALCULATE TOTAL WORKSPACE SIZE -size_t calculateTotalWorkspaceSize(size_t *workspaces, int count) { - size_t total = 0; - for (int i = 0; i < count; i++) { - total += workspaces[i]; - if (workspaces[i] % CUDA_MEM_ALIGN) { - total += CUDA_MEM_ALIGN - (workspaces[i] % CUDA_MEM_ALIGN); +size_t calculateTotalWorkspaceSize(size_t* workspaces, int count) +{ + size_t total = 0; + for (int i = 0; i < count; i++) + { + total += workspaces[i]; + if (workspaces[i] % CUDA_MEM_ALIGN) + { + total += CUDA_MEM_ALIGN - (workspaces[i] % CUDA_MEM_ALIGN); + } } - } - return total; + return total; } using nvinfer1::DataType; -template +template __launch_bounds__(nthds_per_cta) __global__ - void setUniformOffsets_kernel(const int num_segments, const int offset, int *d_offsets) { - const int idx = blockIdx.x * nthds_per_cta + threadIdx.x; - if (idx <= num_segments) d_offsets[idx] = idx * offset; + void setUniformOffsets_kernel(const int num_segments, const int offset, int* d_offsets) +{ + const int idx = blockIdx.x * nthds_per_cta + threadIdx.x; + if (idx <= num_segments) d_offsets[idx] = idx * offset; } -void setUniformOffsets(cudaStream_t stream, const int num_segments, const int offset, - int *d_offsets) { - const int BS = 32; - const int GS = (num_segments + 1 + BS - 1) / BS; - setUniformOffsets_kernel<<>>(num_segments, offset, d_offsets); +void setUniformOffsets(cudaStream_t stream, const int num_segments, const int offset, int* d_offsets) +{ + const int BS = 32; + const int GS = (num_segments + 1 + BS - 1) / BS; + setUniformOffsets_kernel<<>>(num_segments, offset, d_offsets); } -size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX) { - if (DT_BBOX == DataType::kFLOAT) { - return N * C1 * sizeof(float); - } +size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX) +{ + if (DT_BBOX == DataType::kFLOAT) + { + return N * C1 * sizeof(float); + } - printf("Only FP32 type bounding boxes are supported.\n"); - return (size_t)-1; + printf("Only FP32 type bounding boxes are supported.\n"); + return (size_t)-1; } -size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX) { - if (DT_BBOX 
== DataType::kFLOAT) { - return shareLocation ? 0 : N * C1 * sizeof(float); - } - printf("Only FP32 type bounding boxes are supported.\n"); - return (size_t)-1; +size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX) +{ + if (DT_BBOX == DataType::kFLOAT) + { + return shareLocation ? 0 : N * C1 * sizeof(float); + } + printf("Only FP32 type bounding boxes are supported.\n"); + return (size_t)-1; } -size_t detectionForwardPreNMSSize(int N, int C2) { - ASSERT(sizeof(float) == sizeof(int)); - return N * C2 * sizeof(float); +size_t detectionForwardPreNMSSize(int N, int C2) +{ + ASSERT(sizeof(float) == sizeof(int)); + return N * C2 * sizeof(float); } -size_t detectionForwardPostNMSSize(int N, int numClasses, int topK) { - ASSERT(sizeof(float) == sizeof(int)); - return N * numClasses * topK * sizeof(float); +size_t detectionForwardPostNMSSize(int N, int numClasses, int topK) +{ + ASSERT(sizeof(float) == sizeof(int)); + return N * numClasses * topK * sizeof(float); } -size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, - int numPredsPerClass, int topK, DataType DT_BBOX, - DataType DT_SCORE) { - size_t wss[7]; - wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX); - wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX); - wss[2] = detectionForwardPreNMSSize(N, C2); - wss[3] = detectionForwardPreNMSSize(N, C2); - wss[4] = detectionForwardPostNMSSize(N, numClasses, topK); - wss[5] = detectionForwardPostNMSSize(N, numClasses, topK); - wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE), - sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE)); - return calculateTotalWorkspaceSize(wss, 7); +size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, int numPredsPerClass, int topK, DataType DT_BBOX, DataType DT_SCORE) +{ + size_t wss[7]; + wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX); + wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX); + wss[2] = detectionForwardPreNMSSize(N, C2); + wss[3] = detectionForwardPreNMSSize(N, C2); + wss[4] = detectionForwardPostNMSSize(N, numClasses, topK); + wss[5] = detectionForwardPostNMSSize(N, numClasses, topK); + wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE), + sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE)); + return calculateTotalWorkspaceSize(wss, 7); } diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/permuteData.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/permuteData.cu index 659c964970..23600a3ce8 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/permuteData.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/permuteData.cu @@ -5,72 +5,80 @@ #include "nms/kernel.h" -template +template __launch_bounds__(nthds_per_cta) __global__ - void permuteData_kernel(const int nthreads, const int num_classes, const int num_data, - const int num_dim, bool confSigmoid, const Dtype *data, - Dtype *new_data) { - // data format: [batch_size, num_data, num_classes, num_dim] - for (int index = blockIdx.x * nthds_per_cta + threadIdx.x; index < nthreads; - index += nthds_per_cta * gridDim.x) { - const int i = index % num_dim; - const int c = (index / num_dim) % num_classes; - const int d = (index / num_dim / num_classes) % num_data; - const int n = index / num_dim / num_classes / num_data; - const int new_index = ((n * num_classes + c) * num_data + d) * 
num_dim + i; - float result = data[index]; - if (confSigmoid) result = exp(result) / (1 + exp(result)); + void permuteData_kernel(const int nthreads, const int num_classes, const int num_data, const int num_dim, bool confSigmoid, const Dtype* data, Dtype* new_data) +{ + // data format: [batch_size, num_data, num_classes, num_dim] + for (int index = blockIdx.x * nthds_per_cta + threadIdx.x; index < nthreads; + index += nthds_per_cta * gridDim.x) + { + const int i = index % num_dim; + const int c = (index / num_dim) % num_classes; + const int d = (index / num_dim / num_classes) % num_data; + const int n = index / num_dim / num_classes / num_data; + const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i; + float result = data[index]; + if (confSigmoid) result = exp(result) / (1 + exp(result)); - new_data[new_index] = result; - } - // new data format: [batch_size, num_classes, num_data, num_dim] + new_data[new_index] = result; + } + // new data format: [batch_size, num_classes, num_data, num_dim] } -template -pluginStatus_t permuteData_gpu(cudaStream_t stream, const int nthreads, const int num_classes, - const int num_data, const int num_dim, bool confSigmoid, - const void *data, void *new_data) { - const int BS = 512; - const int GS = (nthreads + BS - 1) / BS; - permuteData_kernel<<>>(nthreads, num_classes, num_data, num_dim, - confSigmoid, (const Dtype *)data, - (Dtype *)new_data); - CSC(cudaGetLastError(), STATUS_FAILURE); - return STATUS_SUCCESS; +template +pluginStatus_t permuteData_gpu(cudaStream_t stream, const int nthreads, const int num_classes, const int num_data, const int num_dim, bool confSigmoid, const void* data, void* new_data) +{ + const int BS = 512; + const int GS = (nthreads + BS - 1) / BS; + permuteData_kernel<<>>(nthreads, num_classes, num_data, num_dim, confSigmoid, (const Dtype*)data, (Dtype*)new_data); + CSC(cudaGetLastError(), STATUS_FAILURE); + return STATUS_SUCCESS; } // permuteData LAUNCH CONFIG -typedef pluginStatus_t (*pdFunc)(cudaStream_t, const int, const int, const int, const int, bool, - const void *, void *); +typedef pluginStatus_t (*pdFunc)(cudaStream_t, const int, const int, const int, const int, bool, const void*, void*); -struct pdLaunchConfig { - DataType t_data; - pdFunc function; +struct pdLaunchConfig +{ + DataType t_data; + pdFunc function; - pdLaunchConfig(DataType t_data) : t_data(t_data) {} - pdLaunchConfig(DataType t_data, pdFunc function) : t_data(t_data), function(function) {} - bool operator==(const pdLaunchConfig &other) { return t_data == other.t_data; } + pdLaunchConfig(DataType t_data) + : t_data(t_data) + { + } + pdLaunchConfig(DataType t_data, pdFunc function) + : t_data(t_data) + , function(function) + { + } + bool operator==(const pdLaunchConfig& other) + { + return t_data == other.t_data; + } }; static std::vector pdFuncVec; -bool permuteDataInit() { - pdFuncVec.push_back(pdLaunchConfig(DataType::kFLOAT, permuteData_gpu)); - return true; +bool permuteDataInit() +{ + pdFuncVec.push_back(pdLaunchConfig(DataType::kFLOAT, permuteData_gpu)); + return true; } -static bool initialized = permuteDataInit(); +static bool initialized = permuteDataInit(); -pluginStatus_t permuteData(cudaStream_t stream, const int nthreads, const int num_classes, - const int num_data, const int num_dim, const DataType DT_DATA, - bool confSigmoid, const void *data, void *new_data) { - pdLaunchConfig lc = pdLaunchConfig(DT_DATA); - for (unsigned i = 0; i < pdFuncVec.size(); ++i) { - if (lc == pdFuncVec[i]) { - DEBUG_PRINTF("permuteData kernel 
-      return pdFuncVec[i].function(stream, nthreads, num_classes, num_data, num_dim, confSigmoid,
-                                   data, new_data);
+pluginStatus_t permuteData(cudaStream_t stream, const int nthreads, const int num_classes, const int num_data, const int num_dim, const DataType DT_DATA, bool confSigmoid, const void* data, void* new_data)
+{
+    pdLaunchConfig lc = pdLaunchConfig(DT_DATA);
+    for (unsigned i = 0; i < pdFuncVec.size(); ++i)
+    {
+        if (lc == pdFuncVec[i])
+        {
+            DEBUG_PRINTF("permuteData kernel %d\n", i);
+            return pdFuncVec[i].function(stream, nthreads, num_classes, num_data, num_dim, confSigmoid, data, new_data);
+        }
     }
-  }
-  return STATUS_BAD_PARAM;
+    return STATUS_BAD_PARAM;
 }
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerClass.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerClass.cu
index e72f040cc9..284974e801 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerClass.cu
+++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerClass.cu
@@ -8,134 +8,166 @@
 #include "nms/kernel.h"
 #include "trt_plugin_helper.hpp"
 
-template <typename T_SCORE, unsigned nthds_per_cta>
+template<typename T_SCORE, unsigned nthds_per_cta>
 __launch_bounds__(nthds_per_cta) __global__
-    void prepareSortData(const int num, const int num_classes, const int num_preds_per_class,
-                         const int background_label_id, const float confidence_threshold,
-                         T_SCORE *conf_scores_gpu, T_SCORE *temp_scores, int *temp_idx,
-                         int *d_offsets) {
-  // Prepare scores data for sort
-  const int cur_idx = blockIdx.x * nthds_per_cta + threadIdx.x;
-  const int numPredsPerBatch = num_classes * num_preds_per_class;
-  if (cur_idx < numPredsPerBatch) {
-    const int class_idx = cur_idx / num_preds_per_class;
-    for (int i = 0; i < num; i++) {
-      const int targetIdx = i * numPredsPerBatch + cur_idx;
-      const T_SCORE score = conf_scores_gpu[targetIdx];
+    void prepareSortData(const int num, const int num_classes, const int num_preds_per_class, const int background_label_id, const float confidence_threshold, T_SCORE* conf_scores_gpu, T_SCORE* temp_scores, int* temp_idx, int* d_offsets)
+{
+    // Prepare scores data for sort
+    const int cur_idx = blockIdx.x * nthds_per_cta + threadIdx.x;
+    const int numPredsPerBatch = num_classes * num_preds_per_class;
+    if (cur_idx < numPredsPerBatch)
+    {
+        const int class_idx = cur_idx / num_preds_per_class;
+        for (int i = 0; i < num; i++)
+        {
+            const int targetIdx = i * numPredsPerBatch + cur_idx;
+            const T_SCORE score = conf_scores_gpu[targetIdx];
 
-      // "Clear" background labeled score and index
-      // Because we do not care about background
-      if (class_idx == background_label_id) {
-        // Set scores to 0
-        // Set label = -1
-        temp_scores[targetIdx] = 0.0f;
-        temp_idx[targetIdx] = -1;
-        conf_scores_gpu[targetIdx] = 0.0f;
-      }
-      // "Clear" scores lower than threshold
-      else {
-        if (score > confidence_threshold) {
-          temp_scores[targetIdx] = score;
-          temp_idx[targetIdx] = cur_idx + i * numPredsPerBatch;
-        } else {
-          // Set scores to 0
-          // Set label = -1
-          temp_scores[targetIdx] = 0.0f;
-          temp_idx[targetIdx] = -1;
-          conf_scores_gpu[targetIdx] = 0.0f;
-          // TODO: HERE writing memory too many times
-        }
-      }
+            // "Clear" background labeled score and index
+            // Because we do not care about background
+            if (class_idx == background_label_id)
+            {
+                // Set scores to 0
+                // Set label = -1
+                temp_scores[targetIdx] = 0.0f;
+                temp_idx[targetIdx] = -1;
+                conf_scores_gpu[targetIdx] = 0.0f;
+            }
+            // "Clear" scores lower than threshold
+            else
+            {
+                if (score > confidence_threshold)
+                {
+                    temp_scores[targetIdx] = score;
+                    temp_idx[targetIdx] = cur_idx + i * numPredsPerBatch;
+                }
+                else
+                {
+                    // Set scores to 0
+                    // Set label = -1
+                    temp_scores[targetIdx] = 0.0f;
+                    temp_idx[targetIdx] = -1;
+                    conf_scores_gpu[targetIdx] = 0.0f;
+                    // TODO: HERE writing memory too many times
+                }
+            }
 
-      if ((cur_idx % num_preds_per_class) == 0) {
-        const int offset_ct = i * num_classes + cur_idx / num_preds_per_class;
-        d_offsets[offset_ct] = offset_ct * num_preds_per_class;
-        // set the last element in d_offset
-        if (blockIdx.x == 0 && threadIdx.x == 0)
-          d_offsets[num * num_classes] = num * numPredsPerBatch;
-      }
+            if ((cur_idx % num_preds_per_class) == 0)
+            {
+                const int offset_ct = i * num_classes + cur_idx / num_preds_per_class;
+                d_offsets[offset_ct] = offset_ct * num_preds_per_class;
+                // set the last element in d_offset
+                if (blockIdx.x == 0 && threadIdx.x == 0)
+                    d_offsets[num * num_classes] = num * numPredsPerBatch;
+            }
+        }
     }
-  }
 }
 
-template <typename T_SCORE>
-pluginStatus_t sortScoresPerClass_gpu(cudaStream_t stream, const int num, const int num_classes,
-                                      const int num_preds_per_class, const int background_label_id,
-                                      const float confidence_threshold, void *conf_scores_gpu,
-                                      void *index_array_gpu, void *workspace) {
-  const int num_segments = num * num_classes;
-  void *temp_scores = workspace;
-  const int arrayLen = num * num_classes * num_preds_per_class;
-  void *temp_idx = nextWorkspacePtr((int8_t *)temp_scores, arrayLen * sizeof(T_SCORE));
-  void *d_offsets = nextWorkspacePtr((int8_t *)temp_idx, arrayLen * sizeof(int));
-  size_t cubOffsetSize = (num_segments + 1) * sizeof(int);
-  void *cubWorkspace = nextWorkspacePtr((int8_t *)d_offsets, cubOffsetSize);
+template<typename T_SCORE>
+pluginStatus_t sortScoresPerClass_gpu(cudaStream_t stream, const int num, const int num_classes, const int num_preds_per_class, const int background_label_id, const float confidence_threshold, void* conf_scores_gpu, void* index_array_gpu, void* workspace)
+{
+    const int num_segments = num * num_classes;
+    void* temp_scores = workspace;
+    const int arrayLen = num * num_classes * num_preds_per_class;
+    void* temp_idx = nextWorkspacePtr((int8_t*)temp_scores, arrayLen * sizeof(T_SCORE));
+    void* d_offsets = nextWorkspacePtr((int8_t*)temp_idx, arrayLen * sizeof(int));
+    size_t cubOffsetSize = (num_segments + 1) * sizeof(int);
+    void* cubWorkspace = nextWorkspacePtr((int8_t*)d_offsets, cubOffsetSize);
 
-  const int BS = 512;
-  const int GS = (num_classes * num_preds_per_class + BS - 1) / BS;
-  prepareSortData<T_SCORE, BS><<<GS, BS, 0, stream>>>(
-      num, num_classes, num_preds_per_class, background_label_id, confidence_threshold,
-      (T_SCORE *)conf_scores_gpu, (T_SCORE *)temp_scores, (int *)temp_idx, (int *)d_offsets);
+    const int BS = 512;
+    const int GS = (num_classes * num_preds_per_class + BS - 1) / BS;
+    prepareSortData<T_SCORE, BS><<<GS, BS, 0, stream>>>(
+        num,
+        num_classes,
+        num_preds_per_class,
+        background_label_id,
+        confidence_threshold,
+        (T_SCORE*)conf_scores_gpu,
+        (T_SCORE*)temp_scores,
+        (int*)temp_idx,
+        (int*)d_offsets);
 
-  size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_segments);
-  cub::DeviceSegmentedRadixSort::SortPairsDescending(
-      cubWorkspace, temp_storage_bytes, (const T_SCORE *)(temp_scores),
-      (T_SCORE *)(conf_scores_gpu), (const int *)(temp_idx), (int *)(index_array_gpu), arrayLen,
-      num_segments, (const int *)d_offsets, (const int *)d_offsets + 1, 0, sizeof(T_SCORE) * 8,
-      stream);
-  CSC(cudaGetLastError(), STATUS_FAILURE);
-  return STATUS_SUCCESS;
+    size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_segments);
+    cub::DeviceSegmentedRadixSort::SortPairsDescending(
+        cubWorkspace,
+        temp_storage_bytes,
+        (const T_SCORE*)(temp_scores),
+        (T_SCORE*)(conf_scores_gpu),
+        (const int*)(temp_idx),
+        (int*)(index_array_gpu),
+        arrayLen,
+        num_segments,
+        (const int*)d_offsets,
+        (const int*)d_offsets + 1,
+        0,
+        sizeof(T_SCORE) * 8,
+        stream);
+    CSC(cudaGetLastError(), STATUS_FAILURE);
+    return STATUS_SUCCESS;
 }
 
 // sortScoresPerClass LAUNCH CONFIG
-typedef pluginStatus_t (*sspcFunc)(cudaStream_t, const int, const int, const int, const int,
-                                   const float, void *, void *, void *);
-struct sspcLaunchConfig {
-  DataType t_score;
-  sspcFunc function;
+typedef pluginStatus_t (*sspcFunc)(cudaStream_t, const int, const int, const int, const int, const float, void*, void*, void*);
+struct sspcLaunchConfig
+{
+    DataType t_score;
+    sspcFunc function;
 
-  sspcLaunchConfig(DataType t_score) : t_score(t_score) {}
-  sspcLaunchConfig(DataType t_score, sspcFunc function) : t_score(t_score), function(function) {}
-  bool operator==(const sspcLaunchConfig &other) { return t_score == other.t_score; }
+    sspcLaunchConfig(DataType t_score)
+        : t_score(t_score)
+    {
+    }
+    sspcLaunchConfig(DataType t_score, sspcFunc function)
+        : t_score(t_score)
+        , function(function)
+    {
+    }
+    bool operator==(const sspcLaunchConfig& other)
+    {
+        return t_score == other.t_score;
+    }
 };
 
 static std::vector<sspcLaunchConfig> sspcFuncVec;
-bool sspcInit() {
-  sspcFuncVec.push_back(sspcLaunchConfig(DataType::kFLOAT, sortScoresPerClass_gpu<float>));
-  return true;
+bool sspcInit()
+{
+    sspcFuncVec.push_back(sspcLaunchConfig(DataType::kFLOAT, sortScoresPerClass_gpu<float>));
+    return true;
 }
-static bool initialized = sspcInit();
+static bool initialized = sspcInit();
 
-pluginStatus_t sortScoresPerClass(cudaStream_t stream, const int num, const int num_classes,
-                                  const int num_preds_per_class, const int background_label_id,
-                                  const float confidence_threshold, const DataType DT_SCORE,
-                                  void *conf_scores_gpu, void *index_array_gpu, void *workspace) {
-  sspcLaunchConfig lc = sspcLaunchConfig(DT_SCORE);
-  for (unsigned i = 0; i < sspcFuncVec.size(); ++i) {
-    if (lc == sspcFuncVec[i]) {
-      DEBUG_PRINTF("sortScoresPerClass kernel %d\n", i);
-      return sspcFuncVec[i].function(stream, num, num_classes, num_preds_per_class,
-                                     background_label_id, confidence_threshold, conf_scores_gpu,
-                                     index_array_gpu, workspace);
+pluginStatus_t sortScoresPerClass(cudaStream_t stream, const int num, const int num_classes, const int num_preds_per_class, const int background_label_id, const float confidence_threshold, const DataType DT_SCORE, void* conf_scores_gpu, void* index_array_gpu, void* workspace)
+{
+    sspcLaunchConfig lc = sspcLaunchConfig(DT_SCORE);
+    for (unsigned i = 0; i < sspcFuncVec.size(); ++i)
+    {
+        if (lc == sspcFuncVec[i])
+        {
+            DEBUG_PRINTF("sortScoresPerClass kernel %d\n", i);
+            return sspcFuncVec[i].function(stream, num, num_classes, num_preds_per_class, background_label_id, confidence_threshold, conf_scores_gpu, index_array_gpu, workspace);
+        }
     }
-  }
-  return STATUS_BAD_PARAM;
+    return STATUS_BAD_PARAM;
 }
 
-size_t sortScoresPerClassWorkspaceSize(const int num, const int num_classes,
-                                       const int num_preds_per_class, const DataType DT_CONF) {
-  size_t wss[4];
-  const int arrayLen = num * num_classes * num_preds_per_class;
-  wss[0] = arrayLen * mmdeploy::getElementSize(DT_CONF);  // temp scores
-  wss[1] = arrayLen * sizeof(int);                        // temp indices
-  wss[2] = (num * num_classes + 1) * sizeof(int);         // offsets
-  if (DT_CONF == DataType::kFLOAT) {
-    wss[3] = cubSortPairsWorkspaceSize<float, int>(arrayLen, num * num_classes);  // cub workspace
-  } else {
-    printf("SCORE type not supported\n");
-    return (size_t)-1;
-  }
+size_t sortScoresPerClassWorkspaceSize(const int num, const int num_classes, const int num_preds_per_class, const DataType DT_CONF)
+{
+    size_t wss[4];
+    const int arrayLen = num * num_classes * num_preds_per_class;
+    wss[0] = arrayLen * mmdeploy::getElementSize(DT_CONF);  // temp scores
+    wss[1] = arrayLen * sizeof(int);                        // temp indices
+    wss[2] = (num * num_classes + 1) * sizeof(int);         // offsets
+    if (DT_CONF == DataType::kFLOAT)
+    {
+        wss[3] = cubSortPairsWorkspaceSize<float, int>(arrayLen, num * num_classes);  // cub workspace
+    }
+    else
+    {
+        printf("SCORE type not supported\n");
+        return (size_t)-1;
+    }
 
-  return calculateTotalWorkspaceSize(wss, 4);
+    return calculateTotalWorkspaceSize(wss, 4);
 }
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerImage.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerImage.cu
index a6ad70262d..2a940b691a 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerImage.cu
+++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerImage.cu
@@ -7,75 +7,94 @@
 #include "nms/cub_helper.h"
 #include "nms/kernel.h"
 
-template <typename T_SCORE>
-pluginStatus_t sortScoresPerImage_gpu(cudaStream_t stream, const int num_images,
-                                      const int num_items_per_image, void *unsorted_scores,
-                                      void *unsorted_bbox_indices, void *sorted_scores,
-                                      void *sorted_bbox_indices, void *workspace) {
-  void *d_offsets = workspace;
-  void *cubWorkspace = nextWorkspacePtr((int8_t *)d_offsets, (num_images + 1) * sizeof(int));
+template<typename T_SCORE>
+pluginStatus_t sortScoresPerImage_gpu(cudaStream_t stream, const int num_images, const int num_items_per_image, void* unsorted_scores, void* unsorted_bbox_indices, void* sorted_scores, void* sorted_bbox_indices, void* workspace)
+{
+    void* d_offsets = workspace;
+    void* cubWorkspace = nextWorkspacePtr((int8_t*)d_offsets, (num_images + 1) * sizeof(int));
 
-  setUniformOffsets(stream, num_images, num_items_per_image, (int *)d_offsets);
+    setUniformOffsets(stream, num_images, num_items_per_image, (int*)d_offsets);
 
-  const int arrayLen = num_images * num_items_per_image;
-  size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_images);
-  cub::DeviceSegmentedRadixSort::SortPairsDescending(
-      cubWorkspace, temp_storage_bytes, (const T_SCORE *)(unsorted_scores),
-      (T_SCORE *)(sorted_scores), (const int *)(unsorted_bbox_indices),
-      (int *)(sorted_bbox_indices), arrayLen, num_images, (const int *)d_offsets,
-      (const int *)d_offsets + 1, 0, sizeof(T_SCORE) * 8, stream);
-  CSC(cudaGetLastError(), STATUS_FAILURE);
-  return STATUS_SUCCESS;
+    const int arrayLen = num_images * num_items_per_image;
+    size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_images);
+    cub::DeviceSegmentedRadixSort::SortPairsDescending(
+        cubWorkspace,
+        temp_storage_bytes,
+        (const T_SCORE*)(unsorted_scores),
+        (T_SCORE*)(sorted_scores),
+        (const int*)(unsorted_bbox_indices),
+        (int*)(sorted_bbox_indices),
+        arrayLen,
+        num_images,
+        (const int*)d_offsets,
+        (const int*)d_offsets + 1,
+        0,
+        sizeof(T_SCORE) * 8,
+        stream);
+    CSC(cudaGetLastError(), STATUS_FAILURE);
+    return STATUS_SUCCESS;
 }
 
 // sortScoresPerImage LAUNCH CONFIG
-typedef pluginStatus_t (*sspiFunc)(cudaStream_t, const int, const int, void *, void *, void *,
-                                   void *, void *);
-struct sspiLaunchConfig {
-  DataType t_score;
-  sspiFunc function;
+typedef pluginStatus_t (*sspiFunc)(cudaStream_t, const int, const int, void*, void*, void*, void*, void*);
+struct sspiLaunchConfig
+{
+    DataType t_score;
+    sspiFunc function;
 
-  sspiLaunchConfig(DataType t_score) : t_score(t_score) {}
-  sspiLaunchConfig(DataType t_score, sspiFunc function) : t_score(t_score), function(function) {}
-  bool operator==(const sspiLaunchConfig &other) { return t_score == other.t_score; }
+    sspiLaunchConfig(DataType t_score)
+        : t_score(t_score)
+    {
+    }
+    sspiLaunchConfig(DataType t_score, sspiFunc function)
+        : t_score(t_score)
+        , function(function)
+    {
+    }
+    bool operator==(const sspiLaunchConfig& other)
+    {
+        return t_score == other.t_score;
+    }
 };
 
 static std::vector<sspiLaunchConfig> sspiFuncVec;
-bool sspiInit() {
-  sspiFuncVec.push_back(sspiLaunchConfig(DataType::kFLOAT, sortScoresPerImage_gpu<float>));
-  return true;
+bool sspiInit()
+{
+    sspiFuncVec.push_back(sspiLaunchConfig(DataType::kFLOAT, sortScoresPerImage_gpu<float>));
+    return true;
 }
-static bool initialized = sspiInit();
+static bool initialized = sspiInit();
 
-pluginStatus_t sortScoresPerImage(cudaStream_t stream, const int num_images,
-                                  const int num_items_per_image, const DataType DT_SCORE,
-                                  void *unsorted_scores, void *unsorted_bbox_indices,
-                                  void *sorted_scores, void *sorted_bbox_indices, void *workspace) {
-  sspiLaunchConfig lc = sspiLaunchConfig(DT_SCORE);
-  for (unsigned i = 0; i < sspiFuncVec.size(); ++i) {
-    if (lc == sspiFuncVec[i]) {
-      DEBUG_PRINTF("sortScoresPerImage kernel %d\n", i);
-      return sspiFuncVec[i].function(stream, num_images, num_items_per_image, unsorted_scores,
-                                     unsorted_bbox_indices, sorted_scores, sorted_bbox_indices,
-                                     workspace);
+pluginStatus_t sortScoresPerImage(cudaStream_t stream, const int num_images, const int num_items_per_image, const DataType DT_SCORE, void* unsorted_scores, void* unsorted_bbox_indices, void* sorted_scores, void* sorted_bbox_indices, void* workspace)
+{
+    sspiLaunchConfig lc = sspiLaunchConfig(DT_SCORE);
+    for (unsigned i = 0; i < sspiFuncVec.size(); ++i)
+    {
+        if (lc == sspiFuncVec[i])
+        {
+            DEBUG_PRINTF("sortScoresPerImage kernel %d\n", i);
+            return sspiFuncVec[i].function(stream, num_images, num_items_per_image, unsorted_scores, unsorted_bbox_indices, sorted_scores, sorted_bbox_indices, workspace);
+        }
    }
-  }
-  return STATUS_BAD_PARAM;
+    return STATUS_BAD_PARAM;
 }
 
-size_t sortScoresPerImageWorkspaceSize(const int num_images, const int num_items_per_image,
-                                       const DataType DT_SCORE) {
-  const int arrayLen = num_images * num_items_per_image;
-  size_t wss[2];
-  wss[0] = (num_images + 1) * sizeof(int);  // offsets
-  if (DT_SCORE == DataType::kFLOAT) {
-    wss[1] = cubSortPairsWorkspaceSize<float, int>(arrayLen,
-                                                   num_images);  // cub workspace
-  } else {
-    printf("SCORE type not supported.\n");
-    return (size_t)-1;
-  }
+size_t sortScoresPerImageWorkspaceSize(const int num_images, const int num_items_per_image, const DataType DT_SCORE)
+{
+    const int arrayLen = num_images * num_items_per_image;
+    size_t wss[2];
+    wss[0] = (num_images + 1) * sizeof(int);  // offsets
+    if (DT_SCORE == DataType::kFLOAT)
+    {
+        wss[1] = cubSortPairsWorkspaceSize<float, int>(arrayLen,
+                                                       num_images);  // cub workspace
+    }
+    else
+    {
+        printf("SCORE type not supported.\n");
+        return (size_t)-1;
+    }
 
-  return calculateTotalWorkspaceSize(wss, 2);
+    return calculateTotalWorkspaceSize(wss, 2);
 }
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu
index 47e8ae8615..67fa9d7961 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu
+++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu
@@ -4,92 +4,98 @@
 
 using mmdeploy::TensorDesc;
 
-template <typename scalar_t>
-__global__ void copy_permute_kernel(scalar_t *__restrict__ dst, const scalar_t *__restrict__ src,
-                                    int n, TensorDesc ts_src_stride, TensorDesc ts_dst_stride,
-                                    TensorDesc ts_permute) {
-  const int src_dim = ts_src_stride.dim;
-  const auto src_stride = ts_src_stride.stride;
-  const auto dst_stride = ts_dst_stride.stride;
-  const auto permute = ts_permute.shape;
-  CUDA_1D_KERNEL_LOOP(index, n) {
-    size_t dst_index = index;
-    size_t src_index = 0;
-    for (int i = 0; i < src_dim; ++i) {
-      int dim_index = dst_index / dst_stride[i];
-      dst_index = dst_index % dst_stride[i];
-      src_index += dim_index * src_stride[permute[i]];
+template<typename scalar_t>
+__global__ void copy_permute_kernel(scalar_t* __restrict__ dst, const scalar_t* __restrict__ src, int n, TensorDesc ts_src_stride, TensorDesc ts_dst_stride, TensorDesc ts_permute)
+{
+    const int src_dim = ts_src_stride.dim;
+    const auto src_stride = ts_src_stride.stride;
+    const auto dst_stride = ts_dst_stride.stride;
+    const auto permute = ts_permute.shape;
+    CUDA_1D_KERNEL_LOOP(index, n)
+    {
+        size_t dst_index = index;
+        size_t src_index = 0;
+        for (int i = 0; i < src_dim; ++i)
+        {
+            int dim_index = dst_index / dst_stride[i];
+            dst_index = dst_index % dst_stride[i];
+            src_index += dim_index * src_stride[permute[i]];
+        }
+        dst[index] = src[src_index];
     }
-    dst[index] = src[src_index];
-  }
 }
 
-template <typename scalar_t>
-void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size, int *permute, int src_dim,
-                   cudaStream_t stream) {
-  size_t copy_size = 1;
-  TensorDesc ts_permute;
-  memcpy(&(ts_permute.shape[0]), permute, src_dim * sizeof(int));
+template<typename scalar_t>
+void memcpyPermute(scalar_t* dst, const scalar_t* src, int* src_size, int* permute, int src_dim, cudaStream_t stream)
+{
+    size_t copy_size = 1;
+    TensorDesc ts_permute;
+    memcpy(&(ts_permute.shape[0]), permute, src_dim * sizeof(int));
 
-  TensorDesc ts_src_stride;
-  TensorDesc ts_dst_stride;
-  ts_src_stride.dim = src_dim;
-  ts_dst_stride.dim = src_dim;
-  int *src_stride = &(ts_src_stride.stride[0]);
-  int *dst_stride = &(ts_dst_stride.stride[0]);
-  int *dst_size = &(ts_dst_stride.shape[0]);
-  src_stride[src_dim - 1] = 1;
-  dst_stride[src_dim - 1] = 1;
+    TensorDesc ts_src_stride;
+    TensorDesc ts_dst_stride;
+    ts_src_stride.dim = src_dim;
+    ts_dst_stride.dim = src_dim;
+    int* src_stride = &(ts_src_stride.stride[0]);
+    int* dst_stride = &(ts_dst_stride.stride[0]);
+    int* dst_size = &(ts_dst_stride.shape[0]);
+    src_stride[src_dim - 1] = 1;
+    dst_stride[src_dim - 1] = 1;
 
-  for (int i = src_dim - 1; i >= 0; --i) {
-    dst_size[i] = src_size[permute[i]];
-    if (i < src_dim - 1) {
-      src_stride[i] = src_stride[i + 1] * src_size[i + 1];
+    for (int i = src_dim - 1; i >= 0; --i)
+    {
+        dst_size[i] = src_size[permute[i]];
+        if (i < src_dim - 1)
+        {
+            src_stride[i] = src_stride[i + 1] * src_size[i + 1];
+        }
    }
-  }
 
-  for (int i = src_dim - 1; i >= 0; --i) {
-    copy_size *= dst_size[i];
-    if (i < src_dim - 1) {
-      dst_stride[i] = dst_stride[i + 1] * dst_size[i + 1];
+    for (int i = src_dim - 1; i >= 0; --i)
+    {
+        copy_size *= dst_size[i];
+        if (i < src_dim - 1)
+        {
+            dst_stride[i] = dst_stride[i + 1] * dst_size[i + 1];
+        }
    }
-  }
 
-  copy_permute_kernel<scalar_t><<<GET_BLOCKS(copy_size), THREADS_PER_BLOCK, 0, stream>>>(
-      dst, src, copy_size, ts_src_stride, ts_dst_stride, ts_permute);
+    copy_permute_kernel<scalar_t><<<GET_BLOCKS(copy_size), THREADS_PER_BLOCK, 0, stream>>>(
+        dst,
+        src,
+        copy_size,
+        ts_src_stride,
+        ts_dst_stride,
+        ts_permute);
 }
 
-template void memcpyPermute<float>(float *dst, const float *src, int *src_size, int *permute,
-                                   int src_dim, cudaStream_t stream);
-template void memcpyPermute<half>(half *dst, const half *src, int *src_size, int *permute,
-                                  int src_dim, cudaStream_t stream);
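[Editor's aside, not part of the patch: the cubSortPairsWorkspaceSize()/SortPairsDescending() pairing in the NMS files above follows CUB's standard two-phase contract: a first call with a null temp-storage pointer only reports the required byte count, and a second call with real storage does the work. The helper below is a minimal, self-contained sketch of that pattern; segmentedSortDescending and its parameter names are invented for illustration, while the cub:: and CUDA runtime calls are the real API.]

#include <cub/cub.cuh>

// Sorts each segment of (score, index) pairs in descending score order.
// d_offsets holds num_segments + 1 boundary entries, as in the code above.
cudaError_t segmentedSortDescending(cudaStream_t stream,
                                    const float* keys_in, float* keys_out,
                                    const int* vals_in, int* vals_out,
                                    int num_items, int num_segments,
                                    const int* d_offsets)
{
    size_t temp_bytes = 0;
    // Phase 1: null workspace pointer => CUB only computes temp_bytes.
    cub::DeviceSegmentedRadixSort::SortPairsDescending(
        nullptr, temp_bytes, keys_in, keys_out, vals_in, vals_out,
        num_items, num_segments, d_offsets, d_offsets + 1,
        0, int(sizeof(float)) * 8, stream);

    void* d_temp = nullptr;
    cudaError_t err = cudaMalloc(&d_temp, temp_bytes);
    if (err != cudaSuccess) return err;

    // Phase 2: the same call with a real workspace performs the sort.
    err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
        d_temp, temp_bytes, keys_in, keys_out, vals_in, vals_out,
        num_items, num_segments, d_offsets, d_offsets + 1,
        0, int(sizeof(float)) * 8, stream);
    cudaFree(d_temp);
    return err;
}

[The plugins above avoid the cudaMalloc by pre-computing the same size on the host and carving it out of the TensorRT-provided workspace.]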
+template void memcpyPermute<float>(float* dst, const float* src, int* src_size, int* permute, int src_dim, cudaStream_t stream);
+template void memcpyPermute<half>(half* dst, const half* src, int* src_size, int* permute, int src_dim, cudaStream_t stream);
 
-cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t *cudnn_dtype) {
-  switch (trt_dtype) {
-    case nvinfer1::DataType::kFLOAT:
-      *cudnn_dtype = CUDNN_DATA_FLOAT;
-      break;
-    case nvinfer1::DataType::kHALF:
-      *cudnn_dtype = CUDNN_DATA_HALF;
-      break;
-    default:
-      return CUDNN_STATUS_BAD_PARAM;
-  }
-  return CUDNN_STATUS_SUCCESS;
+cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t* cudnn_dtype)
+{
+    switch (trt_dtype)
+    {
+        case nvinfer1::DataType::kFLOAT:
+            *cudnn_dtype = CUDNN_DATA_FLOAT;
+            break;
+        case nvinfer1::DataType::kHALF:
+            *cudnn_dtype = CUDNN_DATA_HALF;
+            break;
+        default:
+            return CUDNN_STATUS_BAD_PARAM;
+    }
+    return CUDNN_STATUS_SUCCESS;
 }
 
-template <>
-cublasStatus_t cublasGemmWrap<float>(cublasHandle_t handle, cublasOperation_t transa,
-                                     cublasOperation_t transb, int m, int n, int k,
-                                     const float *alpha, const float *A, int lda, const float *B,
-                                     int ldb, const float *beta, float *C, int ldc) {
-  return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+template<>
+cublasStatus_t cublasGemmWrap<float>(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc)
+{
+    return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }
 
-template <>
-cublasStatus_t cublasGemmWrap<half>(cublasHandle_t handle, cublasOperation_t transa,
-                                    cublasOperation_t transb, int m, int n, int k,
-                                    const half *alpha, const half *A, int lda, const half *B,
-                                    int ldb, const half *beta, half *C, int ldc) {
-  return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+template<>
+cublasStatus_t cublasGemmWrap<half>(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const half* alpha, const half* A, int lda, const half* B, int ldb, const half* beta, half* C, int ldc)
+{
+    return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp
index 0d518323d2..b833a7e19a 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp
@@ -10,254 +10,302 @@
 
 using namespace nvinfer1;
 
-namespace mmdeploy {
-namespace {
-static const char *PLUGIN_VERSION{"1"};
-static const char *PLUGIN_NAME{"MMCVDeformConv2d"};
-}  // namespace
-
-DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string &name,
-                                                         const nvinfer1::Dims stride,
-                                                         const nvinfer1::Dims padding,
-                                                         const nvinfer1::Dims dilation,
-                                                         const int deformableGroup, const int group)
-    : TRTPluginBase(name),
-      mStride(stride),
-      mPadding(padding),
-      mDilation(dilation),
-      mDeformableGroup(deformableGroup),
-      mGroup(group) {}
-
-DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string name, const void *data,
-                                                         size_t length)
-    : TRTPluginBase(name) {
-  deserialize_value(&data, &length, &mStride);
-  deserialize_value(&data, &length, &mPadding);
-  deserialize_value(&data, &length, &mDilation);
-  deserialize_value(&data,
&length, &mDeformableGroup); - deserialize_value(&data, &length, &mGroup); -} -DeformableConvPluginDynamic::~DeformableConvPluginDynamic() {} - -nvinfer1::IPluginV2DynamicExt *DeformableConvPluginDynamic::clone() const TRT_NOEXCEPT { - DeformableConvPluginDynamic *plugin = new DeformableConvPluginDynamic( - mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs DeformableConvPluginDynamic::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - // input[0] == input - // input[1] == offset - // input[2] == weight - nvinfer1::DimsExprs ret; - ret.nbDims = 4; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[2].d[0]; - - ret.d[2] = inputs[1].d[2]; - ret.d[3] = inputs[1].d[3]; - - return ret; -} - -bool DeformableConvPluginDynamic::supportsFormatCombination( - int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 0) { - return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT || - ioDesc[pos].type == nvinfer1::DataType::kHALF) && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - } else { - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - } -} - -void DeformableConvPluginDynamic::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT {} - -size_t DeformableConvPluginDynamic::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type); - - int batch_size = inputs[0].dims.d[0]; - int nInputPlane = inputs[0].dims.d[1]; - int inputHeight = inputs[0].dims.d[2]; - int inputWidth = inputs[0].dims.d[3]; - - int nOutputPlane = outputs[0].dims.d[1]; - int outputHeight = outputs[0].dims.d[2]; - int outputWidth = outputs[0].dims.d[3]; - - int kW = inputs[2].dims.d[2]; - int kH = inputs[2].dims.d[3]; - int im2col_step = std::min(32, batch_size); - - size_t col_size = mmdeploy::getAlignedSize(nInputPlane * kW * kH * im2col_step * outputHeight * - outputWidth * sizeof_dtype); - - size_t out_size = 0; - if (im2col_step != 1) - out_size = mmdeploy::getAlignedSize(batch_size * nOutputPlane * outputHeight * outputWidth * - sizeof_dtype); - - return col_size + out_size; -} - -int DeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, void *const *outputs, - void *workSpace, cudaStream_t stream) TRT_NOEXCEPT { - int batch = inputDesc[0].dims.d[0]; - int channels = inputDesc[0].dims.d[1]; - int height = inputDesc[0].dims.d[2]; - int width = inputDesc[0].dims.d[3]; - int channels_out = outputDesc[0].dims.d[1]; - int kernel_h = inputDesc[2].dims.d[2]; - int kernel_w = inputDesc[2].dims.d[3]; - - const void *x = inputs[0]; - const void *offset = inputs[1]; - const void *weight = inputs[2]; - void *output = outputs[0]; - int im2col_step = std::min(batch, 32); - - auto data_type = inputDesc[0].type; - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - deform_conv((float *)x, (float *)weight, (float *)offset, (float *)output, workSpace, - batch, channels, height, width, channels_out, kernel_w, kernel_h, - mStride.d[0], mStride.d[1], mPadding.d[0], 
mPadding.d[1], mDilation.d[0], - mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle, - stream); - break; - case nvinfer1::DataType::kHALF: - deform_conv((half *)x, (half *)weight, (half *)offset, (half *)output, workSpace, batch, - channels, height, width, channels_out, kernel_w, kernel_h, mStride.d[0], - mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], - mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream); - break; - default: - return 1; - } - - return 0; -} - -nvinfer1::DataType DeformableConvPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *DeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *DeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -int DeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t DeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) + - serialized_size(mDeformableGroup) + serialized_size(mGroup); -} - -void DeformableConvPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mStride); - serialize_value(&buffer, mPadding); - serialize_value(&buffer, mDilation); - serialize_value(&buffer, mDeformableGroup); - serialize_value(&buffer, mGroup); -} - -void DeformableConvPluginDynamic::attachToContext( - cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT { - m_cublas_handle = cublasContext; -} - -void DeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {} - -////////////////////// creator ///////////////////////////// - -DeformableConvPluginDynamicCreator::DeformableConvPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(nvinfer1::PluginField("stride")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("padding")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("groups")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *DeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char *DeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - nvinfer1::Dims stride{2, {1, 1}}; - nvinfer1::Dims padding{2, {0, 0}}; - nvinfer1::Dims dilation{2, {1, 1}}; - int deformableGroup = 1; - int group = 1; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"MMCVDeformConv2d"}; + } // namespace + + DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string& name, + const nvinfer1::Dims stride, + const nvinfer1::Dims padding, + const nvinfer1::Dims dilation, + const int deformableGroup, + const int group) + : TRTPluginBase(name) + , mStride(stride) + , mPadding(padding) + , mDilation(dilation) + , 
mDeformableGroup(deformableGroup) + , mGroup(group) + { } - std::string field_name(fc->fields[i].name); - if (field_name.compare("deform_groups") == 0) { - deformableGroup = static_cast(fc->fields[i].data)[0]; + DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mStride); + deserialize_value(&data, &length, &mPadding); + deserialize_value(&data, &length, &mDilation); + deserialize_value(&data, &length, &mDeformableGroup); + deserialize_value(&data, &length, &mGroup); + } + DeformableConvPluginDynamic::~DeformableConvPluginDynamic() {} + + nvinfer1::IPluginV2DynamicExt* DeformableConvPluginDynamic::clone() const TRT_NOEXCEPT + { + DeformableConvPluginDynamic* plugin = new DeformableConvPluginDynamic( + mLayerName, + mStride, + mPadding, + mDilation, + mDeformableGroup, + mGroup); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs DeformableConvPluginDynamic::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + // input[0] == input + // input[1] == offset + // input[2] == weight + nvinfer1::DimsExprs ret; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[2].d[0]; + + ret.d[2] = inputs[1].d[2]; + ret.d[3] = inputs[1].d[3]; + + return ret; + } + + bool DeformableConvPluginDynamic::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (pos == 0) + { + return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT || + ioDesc[pos].type == nvinfer1::DataType::kHALF) && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + } + else + { + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + } + } + + void DeformableConvPluginDynamic::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT {} + + size_t DeformableConvPluginDynamic::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type); + + int batch_size = inputs[0].dims.d[0]; + int nInputPlane = inputs[0].dims.d[1]; + int inputHeight = inputs[0].dims.d[2]; + int inputWidth = inputs[0].dims.d[3]; + + int nOutputPlane = outputs[0].dims.d[1]; + int outputHeight = outputs[0].dims.d[2]; + int outputWidth = outputs[0].dims.d[3]; + + int kW = inputs[2].dims.d[2]; + int kH = inputs[2].dims.d[3]; + int im2col_step = std::min(32, batch_size); + + size_t col_size = mmdeploy::getAlignedSize(nInputPlane * kW * kH * im2col_step * outputHeight * + outputWidth * sizeof_dtype); + + size_t out_size = 0; + if (im2col_step != 1) + out_size = mmdeploy::getAlignedSize(batch_size * nOutputPlane * outputHeight * outputWidth * + sizeof_dtype); + + return col_size + out_size; + } + + int DeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int batch = inputDesc[0].dims.d[0]; + int channels = inputDesc[0].dims.d[1]; + int height = inputDesc[0].dims.d[2]; + int width = inputDesc[0].dims.d[3]; + int channels_out = 
outputDesc[0].dims.d[1]; + int kernel_h = inputDesc[2].dims.d[2]; + int kernel_w = inputDesc[2].dims.d[3]; + + const void* x = inputs[0]; + const void* offset = inputs[1]; + const void* weight = inputs[2]; + void* output = outputs[0]; + int im2col_step = std::min(batch, 32); + + auto data_type = inputDesc[0].type; + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + deform_conv((float*)x, (float*)weight, (float*)offset, (float*)output, workSpace, batch, channels, height, width, channels_out, kernel_w, kernel_h, mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream); + break; + case nvinfer1::DataType::kHALF: + deform_conv((half*)x, (half*)weight, (half*)offset, (half*)output, workSpace, batch, channels, height, width, channels_out, kernel_w, kernel_h, mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream); + break; + default: + return 1; + } + + return 0; + } + + nvinfer1::DataType DeformableConvPluginDynamic::getOutputDataType( + int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* DeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* DeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int DeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t DeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) + + serialized_size(mDeformableGroup) + serialized_size(mGroup); + } + + void DeformableConvPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mStride); + serialize_value(&buffer, mPadding); + serialize_value(&buffer, mDilation); + serialize_value(&buffer, mDeformableGroup); + serialize_value(&buffer, mGroup); + } + + void DeformableConvPluginDynamic::attachToContext( + cudnnContext* cudnnContext, + cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT + { + m_cublas_handle = cublasContext; + } + + void DeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {} + + ////////////////////// creator ///////////////////////////// + + DeformableConvPluginDynamicCreator::DeformableConvPluginDynamicCreator() + { + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(nvinfer1::PluginField("stride")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("padding")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("groups")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups")); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); } - if (field_name.compare("groups") == 0) { - group = static_cast(fc->fields[i].data)[0]; + const char* DeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; } - if (field_name.compare("stride") == 0) { - stride.nbDims = 2; - stride.d[0] = static_cast(fc->fields[i].data)[0]; - stride.d[1] = static_cast(fc->fields[i].data)[1]; + const char* DeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; } - if 
(field_name.compare("padding") == 0) { - padding.nbDims = 2; - padding.d[0] = static_cast(fc->fields[i].data)[0]; - padding.d[1] = static_cast(fc->fields[i].data)[1]; + nvinfer1::IPluginV2* DeformableConvPluginDynamicCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + nvinfer1::Dims stride{2, {1, 1}}; + nvinfer1::Dims padding{2, {0, 0}}; + nvinfer1::Dims dilation{2, {1, 1}}; + int deformableGroup = 1; + int group = 1; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("deform_groups") == 0) + { + deformableGroup = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("groups") == 0) + { + group = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("stride") == 0) + { + stride.nbDims = 2; + stride.d[0] = static_cast(fc->fields[i].data)[0]; + stride.d[1] = static_cast(fc->fields[i].data)[1]; + } + + if (field_name.compare("padding") == 0) + { + padding.nbDims = 2; + padding.d[0] = static_cast(fc->fields[i].data)[0]; + padding.d[1] = static_cast(fc->fields[i].data)[1]; + } + + if (field_name.compare("dilation") == 0) + { + dilation.nbDims = 2; + dilation.d[0] = static_cast(fc->fields[i].data)[0]; + dilation.d[1] = static_cast(fc->fields[i].data)[1]; + } + } + + DeformableConvPluginDynamic* plugin = + new DeformableConvPluginDynamic(name, stride, padding, dilation, deformableGroup, group); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; } - if (field_name.compare("dilation") == 0) { - dilation.nbDims = 2; - dilation.d[0] = static_cast(fc->fields[i].data)[0]; - dilation.d[1] = static_cast(fc->fields[i].data)[1]; + nvinfer1::IPluginV2* DeformableConvPluginDynamicCreator::deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new DeformableConvPluginDynamic(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; } - } - - DeformableConvPluginDynamic *plugin = - new DeformableConvPluginDynamic(name, stride, padding, dilation, deformableGroup, group); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT { - auto plugin = new DeformableConvPluginDynamic(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} -REGISTER_TENSORRT_PLUGIN(DeformableConvPluginDynamicCreator); + REGISTER_TENSORRT_PLUGIN(DeformableConvPluginDynamicCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.hpp b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.hpp index 3ea0ccbefe..6d3b4f936c 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.hpp @@ -9,73 +9,68 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class DeformableConvPluginDynamic : public TRTPluginBase { - public: - DeformableConvPluginDynamic(const std::string &name, const nvinfer1::Dims stride, - const nvinfer1::Dims padding, const nvinfer1::Dims dilation, - const int deformableGroup, const int group); - - DeformableConvPluginDynamic(const std::string name, const void *data, size_t length); - - DeformableConvPluginDynamic() = delete; - 
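[Editor's aside, not part of the patch: the creator registered above with REGISTER_TENSORRT_PLUGIN is normally recovered through TensorRT's global plugin registry. The sketch below is illustrative only; makeDeformConvPlugin and its arguments are hypothetical, while getPluginRegistry(), getPluginCreator() and createPlugin() are the real TensorRT API, and the name/version strings and field names mirror the code above.]

#include <NvInfer.h>
#include <vector>

nvinfer1::IPluginV2* makeDeformConvPlugin(int groups, int deformGroups)
{
    using namespace nvinfer1;
    // Look up the creator that REGISTER_TENSORRT_PLUGIN added to the registry.
    IPluginCreator* creator = getPluginRegistry()->getPluginCreator("MMCVDeformConv2d", "1");
    if (creator == nullptr) return nullptr;

    // Only two of the five fields are supplied here; createPlugin() above
    // falls back to its defaults for stride, padding and dilation.
    std::vector<PluginField> fields;
    fields.emplace_back("groups", &groups, PluginFieldType::kINT32, 1);
    fields.emplace_back("deform_groups", &deformGroups, PluginFieldType::kINT32, 1);

    PluginFieldCollection fc{};
    fc.nbFields = static_cast<int>(fields.size());
    fc.fields = fields.data();
    return creator->createPlugin("deform_conv_0", &fc);
}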
- ~DeformableConvPluginDynamic() TRT_NOEXCEPT override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override; - void detachFromContext() TRT_NOEXCEPT override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; - - private: - nvinfer1::Dims mStride; - nvinfer1::Dims mPadding; - nvinfer1::Dims mDilation; - int mDeformableGroup; - int mGroup; - - cublasHandle_t m_cublas_handle; -}; - -class DeformableConvPluginDynamicCreator : public TRTPluginCreatorBase { - public: - DeformableConvPluginDynamicCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char *getPluginVersion() const TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class DeformableConvPluginDynamic : public TRTPluginBase + { + public: + DeformableConvPluginDynamic(const std::string& name, const nvinfer1::Dims stride, const nvinfer1::Dims padding, const nvinfer1::Dims dilation, const int deformableGroup, const int group); + + DeformableConvPluginDynamic(const std::string name, const void* data, size_t length); + + DeformableConvPluginDynamic() = delete; + + ~DeformableConvPluginDynamic() TRT_NOEXCEPT override; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) + TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const 
nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; + void detachFromContext() TRT_NOEXCEPT override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; + + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + private: + nvinfer1::Dims mStride; + nvinfer1::Dims mPadding; + nvinfer1::Dims mDilation; + int mDeformableGroup; + int mGroup; + + cublasHandle_t m_cublas_handle; + }; + + class DeformableConvPluginDynamicCreator : public TRTPluginCreatorBase + { + public: + DeformableConvPluginDynamicCreator(); + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_DEFORM_CONV_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu index 3f401fc9e2..8fe86280af 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu @@ -68,105 +68,107 @@ #include "trt_deform_conv_kernel.hpp" #include "trt_plugin_helper.hpp" -template -void deform_conv_im2col(const scalar_t* input, const scalar_t* offset, scalar_t* column, - const int channels, const int height, const int width, const int ksize_h, - const int ksize_w, const int pad_h, const int pad_w, const int stride_h, - const int stride_w, const int dilation_h, const int dilation_w, - const int parallel_imgs, const int deformable_group, cudaStream_t stream) { - int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; - int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; - int num_kernels = channels * height_col * width_col * parallel_imgs; - int channel_per_deformable_group = channels / deformable_group; - - deformable_im2col_gpu_kernel<<>>( - num_kernels, input, offset, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, - dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, channels, - deformable_group, height_col, width_col, column); - - cudaCheckError(); +template +void deform_conv_im2col(const scalar_t* input, const scalar_t* offset, scalar_t* column, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, cudaStream_t stream) +{ + 
   int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+    int num_kernels = channels * height_col * width_col * parallel_imgs;
+    int channel_per_deformable_group = channels / deformable_group;
+
+    deformable_im2col_gpu_kernel<scalar_t><<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
+        num_kernels,
+        input,
+        offset,
+        height,
+        width,
+        ksize_h,
+        ksize_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        channel_per_deformable_group,
+        parallel_imgs,
+        channels,
+        deformable_group,
+        height_col,
+        width_col,
+        column);
+
+    cudaCheckError();
 }
 
-template <typename scalar_t>
-void deform_conv(const scalar_t* input, const scalar_t* weight, const scalar_t* offset,
-                 scalar_t* output, void* workspace, int batchSize, int nInputPlane, int inputHeight,
-                 int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW,
-                 int padH, int dilationW, int dilationH, int group, int deformable_group,
-                 int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream) {
-  size_t word_size = sizeof(scalar_t);
-
-  im2col_step = std::min(int(batchSize), im2col_step);
-  long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
-  long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-
-  long outputHW = outputHeight * outputWidth;
-  long kHW = kH * kW;
-  long columns_size =
-      mmdeploy::getAlignedSize(nInputPlane * kHW * im2col_step * outputHW * word_size);
-
-  // column buffer for img2col
-  char* workspace_ptr = reinterpret_cast<char*>(workspace);
-  scalar_t* columns = reinterpret_cast<scalar_t*>(workspace_ptr);
-  workspace_ptr = workspace_ptr + columns_size;
-
-  scalar_t* output_buffer;
-  if (im2col_step == 1) {
-    output_buffer = output;
-  } else {
-    // output need permute when im2col_step!=1
-    output_buffer = reinterpret_cast<scalar_t*>(workspace_ptr);
-  }
-
-  long input_elt_step = im2col_step * nInputPlane * inputHeight * inputWidth;
-  long offset_elt_step = im2col_step * deformable_group * 2 * kHW * outputHW;
-  long out_buffer_step = nOutputPlane * im2col_step * outputHW;
-  long col_g_step = nInputPlane * kHW * im2col_step * outputHW / group;
-  long weight_g_step = nOutputPlane * nInputPlane * kHW / (group * group);
-  long out_buffer_g_step = out_buffer_step / group;
-  int m = nOutputPlane / group;
-  int n = im2col_step * outputHW;
-  int k = nInputPlane * kHW / group;
-  scalar_t alpha = 1.f;
-  scalar_t beta = 0.f;
-
-  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
-    const scalar_t* input_start = input + elt * input_elt_step;
-    const scalar_t* offset_start = offset + elt * offset_elt_step;
-
-    deform_conv_im2col<scalar_t>(input_start, offset_start, columns, nInputPlane, inputHeight,
-                                 inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW,
-                                 im2col_step, deformable_group, stream);
-
-    for (int g = 0; g < group; ++g) {
-      const scalar_t* weight_start = weight + g * weight_g_step;
-      scalar_t* col_start = columns + g * col_g_step;
-      scalar_t* out_buffer_start = output_buffer + elt * out_buffer_step + g * out_buffer_g_step;
-
-      cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, col_start,
-                               n, weight_start, k, &beta, out_buffer_start, n);
-      cudaCheckError();
+template<typename scalar_t>
+void deform_conv(const scalar_t* input, const scalar_t* weight, const scalar_t* offset, scalar_t* output, void* workspace, int batchSize, int nInputPlane, int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream)
+{
+    size_t word_size = sizeof(scalar_t);
+
+    im2col_step = std::min(int(batchSize), im2col_step);
+    long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+    long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+    long outputHW = outputHeight * outputWidth;
+    long kHW = kH * kW;
+    long columns_size =
+        mmdeploy::getAlignedSize(nInputPlane * kHW * im2col_step * outputHW * word_size);
+
+    // column buffer for img2col
+    char* workspace_ptr = reinterpret_cast<char*>(workspace);
+    scalar_t* columns = reinterpret_cast<scalar_t*>(workspace_ptr);
+    workspace_ptr = workspace_ptr + columns_size;
+
+    scalar_t* output_buffer;
+    if (im2col_step == 1)
+    {
+        output_buffer = output;
+    }
+    else
+    {
+        // output need permute when im2col_step!=1
+        output_buffer = reinterpret_cast<scalar_t*>(workspace_ptr);
+    }
+
+    long input_elt_step = im2col_step * nInputPlane * inputHeight * inputWidth;
+    long offset_elt_step = im2col_step * deformable_group * 2 * kHW * outputHW;
+    long out_buffer_step = nOutputPlane * im2col_step * outputHW;
+    long col_g_step = nInputPlane * kHW * im2col_step * outputHW / group;
+    long weight_g_step = nOutputPlane * nInputPlane * kHW / (group * group);
+    long out_buffer_g_step = out_buffer_step / group;
+    int m = nOutputPlane / group;
+    int n = im2col_step * outputHW;
+    int k = nInputPlane * kHW / group;
+    scalar_t alpha = 1.f;
+    scalar_t beta = 0.f;
+
+    for (int elt = 0; elt < batchSize / im2col_step; elt++)
+    {
+        const scalar_t* input_start = input + elt * input_elt_step;
+        const scalar_t* offset_start = offset + elt * offset_elt_step;
+
+        deform_conv_im2col<scalar_t>(input_start, offset_start, columns, nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, stream);
+
+        for (int g = 0; g < group; ++g)
+        {
+            const scalar_t* weight_start = weight + g * weight_g_step;
+            scalar_t* col_start = columns + g * col_g_step;
+            scalar_t* out_buffer_start = output_buffer + elt * out_buffer_step + g * out_buffer_g_step;
+
+            cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, col_start, n, weight_start, k, &beta, out_buffer_start, n);
+            cudaCheckError();
+        }
+    }
+
+    if (im2col_step != 1)
+    {
+        int output_buffer_shape[5] = {batchSize / im2col_step, nOutputPlane, im2col_step, static_cast<int>(outputHeight), static_cast<int>(outputWidth)};
+        int output_buffer_permute[5] = {0, 2, 1, 3, 4};
+        memcpyPermute<scalar_t>(output, output_buffer, &output_buffer_shape[0], &output_buffer_permute[0], 5, stream);
    }
-  }
-
-  if (im2col_step != 1) {
-    int output_buffer_shape[5] = {batchSize / im2col_step, nOutputPlane, im2col_step,
-                                  static_cast<int>(outputHeight), static_cast<int>(outputWidth)};
-    int output_buffer_permute[5] = {0, 2, 1, 3, 4};
-    memcpyPermute<scalar_t>(output, output_buffer, &output_buffer_shape[0],
-                            &output_buffer_permute[0], 5, stream);
-  }
 }
 
-template void deform_conv<float>(const float* input, const float* weight, const float* offset,
-                                 float* output, void* workspace, int batchSize, int nInputPlane,
-                                 int inputHeight, int inputWidth, int nOutputPlane,
int kW, int kH, - int dW, int dH, int padW, int padH, int dilationW, int dilationH, - int group, int deformable_group, int im2col_step, - cublasHandle_t cublas_handle, cudaStream_t stream); +template void deform_conv(const float* input, const float* weight, const float* offset, float* output, void* workspace, int batchSize, int nInputPlane, int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream); + +template void deform_conv<__half>(const __half* input, const __half* weight, const __half* offset, __half* output, void* workspace, int batchSize, int nInputPlane, int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh index c91f17ca4a..330f4b331a 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh +++ b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh @@ -67,108 +67,134 @@ #include "common_cuda_helper.hpp" -template +template __device__ __forceinline__ scalar_t deformable_im2col_bilinear(const scalar_t* __restrict__ input, - const int height, const int width, - float h, float w) { - if (h <= -1 || height <= h || w <= -1 || width <= w) { - return 0; - } + const int height, + const int width, + float h, + float w) +{ + if (h <= -1 || height <= h || w <= -1 || width <= w) + { + return 0; + } - const int h_low = floorf(h); - const int w_low = floorf(w); + const int h_low = floorf(h); + const int w_low = floorf(w); - input += h_low * width; - const scalar_t v1 = (h_low >= 0 && w_low >= 0) ? input[w_low] : static_cast(0.0f); - const int w_high = w_low + 1; - const scalar_t v2 = - (h_low >= 0 && w_high <= width - 1) ? input[w_high] : static_cast(0.0f); - const scalar_t lw = w - w_low; - const scalar_t v_low = fmaf(v2 - v1, lw, v1); - input += width; - const scalar_t v3 = - (h_low <= height - 2 && w_low >= 0) ? input[w_low] : static_cast(0.0f); - const scalar_t v4 = - (h_low <= height - 2 && w_high <= width - 1) ? input[w_high] : static_cast(0.0f); - const scalar_t v_high = fmaf(v4 - v3, lw, v3); - const scalar_t lh = h - h_low; - const scalar_t val = fmaf(v_high - v_low, lh, v_low); - return val; + input += h_low * width; + const scalar_t v1 = (h_low >= 0 && w_low >= 0) ? input[w_low] : static_cast(0.0f); + const int w_high = w_low + 1; + const scalar_t v2 = + (h_low >= 0 && w_high <= width - 1) ? input[w_high] : static_cast(0.0f); + const scalar_t lw = w - w_low; + const scalar_t v_low = fmaf(v2 - v1, lw, v1); + input += width; + const scalar_t v3 = + (h_low <= height - 2 && w_low >= 0) ? input[w_low] : static_cast(0.0f); + const scalar_t v4 = + (h_low <= height - 2 && w_high <= width - 1) ? 
input[w_high] : static_cast(0.0f); + const scalar_t v_high = fmaf(v4 - v3, lw, v3); + const scalar_t lh = h - h_low; + const scalar_t val = fmaf(v_high - v_low, lh, v_low); + return val; } -template <> +template<> __device__ __forceinline__ __half deformable_im2col_bilinear(const __half* __restrict__ input, - const int height, const int width, - float h, float w) { - if (h <= -1 || height <= h || w <= -1 || width <= w) { - return 0; - } + const int height, + const int width, + float h, + float w) +{ + if (h <= -1 || height <= h || w <= -1 || width <= w) + { + return 0; + } - const int h_low = floorf(h); - const int w_low = floorf(w); + const int h_low = floorf(h); + const int w_low = floorf(w); - input += h_low * width; - const float v1 = (h_low >= 0 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f; - const int w_high = w_low + 1; - const float v2 = (h_low >= 0 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f; - const float lw = w - w_low; - const float v_low = fmaf(v2 - v1, lw, v1); - input += width; - const float v3 = (h_low <= height - 2 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f; - const float v4 = - (h_low <= height - 2 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f; - const float v_high = fmaf(v4 - v3, lw, v3); - const float lh = h - h_low; - const float val = fmaf(v_high - v_low, lh, v_low); - return __float2half(val); + input += h_low * width; + const float v1 = (h_low >= 0 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f; + const int w_high = w_low + 1; + const float v2 = (h_low >= 0 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f; + const float lw = w - w_low; + const float v_low = fmaf(v2 - v1, lw, v1); + input += width; + const float v3 = (h_low <= height - 2 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f; + const float v4 = + (h_low <= height - 2 && w_high <= width - 1) ? 
__half2float(input[w_high]) : 0.0f; + const float v_high = fmaf(v4 - v3, lw, v3); + const float lh = h - h_low; + const float val = fmaf(v_high - v_low, lh, v_low); + return __float2half(val); } -template +template __global__ void deformable_im2col_gpu_kernel( - const int n, const scalar_t* __restrict__ data_im, const scalar_t* __restrict__ data_offset, - const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, const int dilation_h, - const int dilation_w, const int channel_per_deformable_group, const int batch_size, - const int num_channels, const int deformable_group, const int height_col, const int width_col, - scalar_t* __restrict__ data_col) { - const int hw_col = height_col * width_col; - const int data_col_step = batch_size * hw_col; + const int n, + const scalar_t* __restrict__ data_im, + const scalar_t* __restrict__ data_offset, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* __restrict__ data_col) +{ + const int hw_col = height_col * width_col; + const int data_col_step = batch_size * hw_col; - CUDA_1D_KERNEL_LOOP(index, n) { - // index index of output matrix - int tmp_index = index; - const int w_col = tmp_index % width_col; - tmp_index /= width_col; - const int h_col = tmp_index % height_col; - tmp_index /= height_col; - const int b_col = tmp_index % batch_size; - const int c_im = tmp_index / batch_size; - const int c_col = c_im * kernel_h * kernel_w; + CUDA_1D_KERNEL_LOOP(index, n) + { + // index index of output matrix + int tmp_index = index; + const int w_col = tmp_index % width_col; + tmp_index /= width_col; + const int h_col = tmp_index % height_col; + tmp_index /= height_col; + const int b_col = tmp_index % batch_size; + const int c_im = tmp_index / batch_size; + const int c_col = c_im * kernel_h * kernel_w; - // compute deformable group index - const int deformable_group_index = c_im / channel_per_deformable_group; + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; - const int h_in = h_col * stride_h - pad_h; - const int w_in = w_col * stride_w - pad_w; - scalar_t* __restrict__ data_col_ptr = data_col + c_col * data_col_step + index % data_col_step; - const scalar_t* __restrict__ data_im_ptr = - data_im + (b_col * num_channels + c_im) * height * width; - const scalar_t* __restrict__ data_offset_ptr = - data_offset + - ((b_col * deformable_group + deformable_group_index) << 1) * kernel_h * kernel_w * hw_col + - h_col * width_col + w_col; - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - const int data_offset_h = (i * kernel_w + j) * hw_col << 1; - const scalar_t offset_h = data_offset_ptr[data_offset_h]; - const int data_offset_w = data_offset_h + hw_col; - const scalar_t offset_w = data_offset_ptr[data_offset_w]; - const scalar_t h_im = h_in + i * dilation_h + (float)offset_h; - const scalar_t w_im = w_in + j * dilation_w + (float)offset_w; - const scalar_t val = deformable_im2col_bilinear(data_im_ptr, height, width, h_im, w_im); - *data_col_ptr = val; - data_col_ptr += data_col_step; - } + const int h_in = h_col * stride_h - pad_h; + const int w_in = 
w_col * stride_w - pad_w; + scalar_t* __restrict__ data_col_ptr = data_col + c_col * data_col_step + index % data_col_step; + const scalar_t* __restrict__ data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t* __restrict__ data_offset_ptr = + data_offset + + ((b_col * deformable_group + deformable_group_index) << 1) * kernel_h * kernel_w * hw_col + + h_col * width_col + w_col; + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h = (i * kernel_w + j) * hw_col << 1; + const scalar_t offset_h = data_offset_ptr[data_offset_h]; + const int data_offset_w = data_offset_h + hw_col; + const scalar_t offset_w = data_offset_ptr[data_offset_w]; + const scalar_t h_im = h_in + i * dilation_h + (float)offset_h; + const scalar_t w_im = w_in + j * dilation_w + (float)offset_w; + const scalar_t val = deformable_im2col_bilinear(data_im_ptr, height, width, h_im, w_im); + *data_col_ptr = val; + data_col_ptr += data_col_step; + } + } } - } } diff --git a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.hpp index 3d8f6dfc45..35f08be1b4 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.hpp @@ -4,17 +4,9 @@ #include #include -template -void deform_conv_im2col(const scalar_t* input, const scalar_t* offset, scalar_t* column, - const int channels, const int height, const int width, const int ksize_h, - const int ksize_w, const int pad_h, const int pad_w, const int stride_h, - const int stride_w, const int dilation_h, const int dilation_w, - const int parallel_imgs, const int deformable_group, cudaStream_t stream); +template +void deform_conv_im2col(const scalar_t* input, const scalar_t* offset, scalar_t* column, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, cudaStream_t stream); -template -void deform_conv(const scalar_t* input, const scalar_t* weight, const scalar_t* offset, - scalar_t* output, void* workspace, int batchSize, int nInputPlane, int inputHeight, - int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW, - int padH, int dilationW, int dilationH, int group, int deformable_group, - int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream); +template +void deform_conv(const scalar_t* input, const scalar_t* weight, const scalar_t* offset, scalar_t* output, void* workspace, int batchSize, int nInputPlane, int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream); #endif // TRT_DEFORM_CONV_KERNEL_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.cpp b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.cpp index b5e6c0b677..7dd688e089 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.cpp @@ -10,141 +10,176 @@ #include "gather_topk_kernel.hpp" #include "trt_serialize.hpp" -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char 
*PLUGIN_NAME{"GatherTopk"}; -} // namespace - -GatherTopk::GatherTopk(const std::string &name) : TRTPluginBase(name) {} - -GatherTopk::GatherTopk(const std::string name, const void *data, size_t length) - : TRTPluginBase(name) {} - -nvinfer1::IPluginV2DynamicExt *GatherTopk::clone() const TRT_NOEXCEPT { - GatherTopk *plugin = new GatherTopk(mLayerName); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs GatherTopk::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - assert(inputs[0].nbDims >= inputs[1].nbDims); - nvinfer1::DimsExprs ret; - ret.nbDims = inputs[0].nbDims; - for (int i = 0; i < inputs[1].nbDims; ++i) { - ret.d[i] = inputs[1].d[i]; - } - for (int i = inputs[1].nbDims; i < inputs[0].nbDims; ++i) { - ret.d[i] = inputs[0].d[i]; - } - return ret; -} - -bool GatherTopk::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - switch (pos) { - case 0: - // data - return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) || - (ioDesc[pos].type == nvinfer1::DataType::kINT32 && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - case 1: - // indices - return ioDesc[pos].type == nvinfer1::DataType::kINT32 && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; - case 2: - // output - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - default: - return true; - } - return true; -} - -void GatherTopk::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT {} - -size_t GatherTopk::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -int GatherTopk::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workSpace, cudaStream_t stream) TRT_NOEXCEPT { - const int *dims = &(inputDesc[0].dims.d[0]); - const int *indices_dims = &(inputDesc[1].dims.d[0]); - int nbDims = inputDesc[0].dims.nbDims; - int indice_nbDims = inputDesc[1].dims.nbDims; - - const void *data = inputs[0]; - const void *indices = inputs[1]; - void *output = outputs[0]; - - auto data_type = inputDesc[0].type; - - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - gather_topk_impl((float *)data, (int *)indices, dims, nbDims, indices_dims, - indice_nbDims, (float *)output, stream); - break; - - case nvinfer1::DataType::kINT32: - gather_topk_impl((int *)data, (int *)indices, dims, nbDims, indices_dims, indice_nbDims, - (int *)output, stream); - break; - default: - break; - } - - return 0; -} - -nvinfer1::DataType GatherTopk::getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *GatherTopk::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *GatherTopk::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -int GatherTopk::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t GatherTopk::getSerializationSize() const TRT_NOEXCEPT { return 0; } - -void GatherTopk::serialize(void *buffer) const TRT_NOEXCEPT {} - 
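For orientation while reading this plugin: GatherTopk gathers rows of a [batch, num_input, channel] tensor using per-batch top-k indices, i.e. output[b, n, c] = input[b, indices[b, n], c]. The following CPU sketch of that indexing is illustrative only (hypothetical names, contiguous row-major layout as assumed by the kernel in gather_topk_kernel.cu) and is not part of the patch:

    // CPU reference for the GatherTopk indexing:
    // output[b, n, c] = input[b, indices[b, n], c]
    template <typename T>
    void gather_topk_cpu(const T* input, const int* indices, T* output,
                         int batch, int num_input, int num_indices, int channel)
    {
        for (int b = 0; b < batch; ++b)
            for (int n = 0; n < num_indices; ++n)
            {
                const int src = indices[b * num_indices + n];  // row selected by top-k
                for (int c = 0; c < channel; ++c)
                    output[(b * num_indices + n) * channel + c] =
                        input[(b * num_input + src) * channel + c];
            }
    }

Keeping the channel loop innermost mirrors the flat 1-D thread index of the CUDA kernel, where c varies fastest and neighbouring threads touch neighbouring elements.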
-GatherTopkCreator::GatherTopkCreator() { - mPluginAttributes.clear(); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *GatherTopkCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *GatherTopkCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -nvinfer1::IPluginV2 *GatherTopkCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - auto *plugin = new GatherTopk(name); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *GatherTopkCreator::deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT { - auto plugin = new GatherTopk(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -REGISTER_TENSORRT_PLUGIN(GatherTopkCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"GatherTopk"}; + } // namespace + + GatherTopk::GatherTopk(const std::string& name) + : TRTPluginBase(name) + { + } + + GatherTopk::GatherTopk(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + } + + nvinfer1::IPluginV2DynamicExt* GatherTopk::clone() const TRT_NOEXCEPT + { + GatherTopk* plugin = new GatherTopk(mLayerName); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs GatherTopk::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + assert(inputs[0].nbDims >= inputs[1].nbDims); + nvinfer1::DimsExprs ret; + ret.nbDims = inputs[0].nbDims; + for (int i = 0; i < inputs[1].nbDims; ++i) + { + ret.d[i] = inputs[1].d[i]; + } + for (int i = inputs[1].nbDims; i < inputs[0].nbDims; ++i) + { + ret.d[i] = inputs[0].d[i]; + } + return ret; + } + + bool GatherTopk::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT + { + switch (pos) + { + case 0: + // data + return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) || + (ioDesc[pos].type == nvinfer1::DataType::kINT32 && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + case 1: + // indices + return ioDesc[pos].type == nvinfer1::DataType::kINT32 && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + case 2: + // output + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + default: + return true; + } + return true; + } + + void GatherTopk::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) TRT_NOEXCEPT {} + + size_t GatherTopk::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT + { + return 0; + } + + int GatherTopk::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + const int* dims = &(inputDesc[0].dims.d[0]); + const int* indices_dims = &(inputDesc[1].dims.d[0]); + int nbDims = inputDesc[0].dims.nbDims; + int indice_nbDims = inputDesc[1].dims.nbDims; + + const void* data = inputs[0]; + const void* indices = inputs[1]; + void* output 
= outputs[0]; + + auto data_type = inputDesc[0].type; + + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + gather_topk_impl((float*)data, (int*)indices, dims, nbDims, indices_dims, indice_nbDims, (float*)output, stream); + break; + + case nvinfer1::DataType::kINT32: + gather_topk_impl((int*)data, (int*)indices, dims, nbDims, indices_dims, indice_nbDims, (int*)output, stream); + break; + default: + break; + } + + return 0; + } + + nvinfer1::DataType GatherTopk::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* GatherTopk::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* GatherTopk::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int GatherTopk::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t GatherTopk::getSerializationSize() const TRT_NOEXCEPT + { + return 0; + } + + void GatherTopk::serialize(void* buffer) const TRT_NOEXCEPT {} + + GatherTopkCreator::GatherTopkCreator() + { + mPluginAttributes.clear(); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* GatherTopkCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* GatherTopkCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* GatherTopkCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + auto* plugin = new GatherTopk(name); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* GatherTopkCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new GatherTopk(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + REGISTER_TENSORRT_PLUGIN(GatherTopkCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.hpp b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.hpp index 72f76a2678..b3db9b4058 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.hpp @@ -9,56 +9,54 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class GatherTopk : public TRTPluginBase { - public: - GatherTopk(const std::string &name); - - GatherTopk(const std::string name, const void *data, size_t length); - - GatherTopk() = delete; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) 
TRT_NOEXCEPT override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; -}; - -class GatherTopkCreator : public TRTPluginCreatorBase { - public: - GatherTopkCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char *getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class GatherTopk : public TRTPluginBase + { + public: + GatherTopk(const std::string& name); + + GatherTopk(const std::string name, const void* data, size_t length); + + GatherTopk() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) + TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; + + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + }; + + class GatherTopkCreator : public TRTPluginCreatorBase + { + public: + GatherTopkCreator(); + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_SCATTERND_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.cu index 9a5c8ec963..873876ec12 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.cu @@ -8,39 +8,34 @@ #include 
"gather_topk_kernel.hpp" #include "trt_plugin_helper.hpp" -template -__global__ void gather_topk_kernel(const scalar_t* input, const int* indices, scalar_t* output, - int batch, int num_input, int num_indices, int channel) { - CUDA_1D_KERNEL_LOOP(index, batch * num_indices * channel) { - const int b_id = index / (num_indices * channel); - const int n_id = (index / channel) % num_indices; - const int c_id = index % channel; +template +__global__ void gather_topk_kernel(const scalar_t* input, const int* indices, scalar_t* output, int batch, int num_input, int num_indices, int channel) +{ + CUDA_1D_KERNEL_LOOP(index, batch * num_indices * channel) + { + const int b_id = index / (num_indices * channel); + const int n_id = (index / channel) % num_indices; + const int c_id = index % channel; - const int input_n_id = indices[b_id * num_indices + n_id]; - const scalar_t value = input[b_id * num_input * channel + input_n_id * channel + c_id]; - output[b_id * num_indices * channel + n_id * channel + c_id] = value; - } + const int input_n_id = indices[b_id * num_indices + n_id]; + const scalar_t value = input[b_id * num_input * channel + input_n_id * channel + c_id]; + output[b_id * num_indices * channel + n_id * channel + c_id] = value; + } } -template -void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims, - const int* indices_dims, int indice_nbDims, scalar_t* output, - cudaStream_t stream) { - int batch = 1; - for (int i = 0; i < indice_nbDims - 1; ++i) batch *= dims[i]; - int num_input = dims[indice_nbDims - 1]; - int num_indices = indices_dims[indice_nbDims - 1]; - int channel = 1; - for (int i = indice_nbDims; i < nbDims; ++i) channel *= dims[i]; - const int col_block = DIVUP(batch * num_indices * channel, THREADS_PER_BLOCK); - gather_topk_kernel<<>>(input, indices, output, batch, - num_input, num_indices, channel); +template +void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims, const int* indices_dims, int indice_nbDims, scalar_t* output, cudaStream_t stream) +{ + int batch = 1; + for (int i = 0; i < indice_nbDims - 1; ++i) batch *= dims[i]; + int num_input = dims[indice_nbDims - 1]; + int num_indices = indices_dims[indice_nbDims - 1]; + int channel = 1; + for (int i = indice_nbDims; i < nbDims; ++i) channel *= dims[i]; + const int col_block = DIVUP(batch * num_indices * channel, THREADS_PER_BLOCK); + gather_topk_kernel<<>>(input, indices, output, batch, num_input, num_indices, channel); } -template void gather_topk_impl(const float* input, const int* indices, const int* dims, - int nbDims, const int* indices_dims, int indice_nbDims, - float* output, cudaStream_t stream); +template void gather_topk_impl(const float* input, const int* indices, const int* dims, int nbDims, const int* indices_dims, int indice_nbDims, float* output, cudaStream_t stream); -template void gather_topk_impl(const int32_t* input, const int* indices, const int* dims, - int nbDims, const int* indices_dims, int indice_nbDims, - int32_t* output, cudaStream_t stream); +template void gather_topk_impl(const int32_t* input, const int* indices, const int* dims, int nbDims, const int* indices_dims, int indice_nbDims, int32_t* output, cudaStream_t stream); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.hpp index 1f9b428394..e5ee6b987e 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.hpp +++ 
b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.hpp @@ -3,8 +3,6 @@ #define TRT_GRID_SAMPLER_KERNEL_HPP #include -template -void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims, - const int* indices_dims, int indice_nbDims, scalar_t* output, - cudaStream_t stream); +template +void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims, const int* indices_dims, int indice_nbDims, scalar_t* output, cudaStream_t stream); #endif // TRT_GRID_SAMPLER_KERNEL_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.cpp b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.cpp index 1850fbfc1a..ef99b1fba6 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.cpp @@ -10,145 +10,190 @@ using namespace nvinfer1; -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"GridPriorsTRT"}; -} // namespace - -GridPriorsTRT::GridPriorsTRT(const std::string &name, const nvinfer1::Dims stride) - : TRTPluginBase(name), mStride(stride) {} - -GridPriorsTRT::GridPriorsTRT(const std::string name, const void *data, size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &mStride); -} -GridPriorsTRT::~GridPriorsTRT() {} - -nvinfer1::IPluginV2DynamicExt *GridPriorsTRT::clone() const TRT_NOEXCEPT { - GridPriorsTRT *plugin = new GridPriorsTRT(mLayerName, mStride); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs GridPriorsTRT::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - // input[0] == base_anchor - // input[1] == empty_h - // input[2] == empty_w - - nvinfer1::DimsExprs ret; - ret.nbDims = 2; - auto area = - exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *inputs[2].d[0], *inputs[1].d[0]); - ret.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *area, *(inputs[0].d[0])); - ret.d[1] = exprBuilder.constant(4); - - return ret; -} - -bool GridPriorsTRT::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 0) { - return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - } else if (pos - nbInputs == 0) { - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - } else { - return true; - } -} - -int GridPriorsTRT::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workSpace, - cudaStream_t stream) TRT_NOEXCEPT { - int num_base_anchors = inputDesc[0].dims.d[0]; - int feat_h = inputDesc[1].dims.d[0]; - int feat_w = inputDesc[2].dims.d[0]; - - const void *base_anchor = inputs[0]; - void *output = outputs[0]; - - auto data_type = inputDesc[0].type; - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - trt_grid_priors_impl((float *)base_anchor, (float *)output, num_base_anchors, feat_w, - feat_h, mStride.d[0], mStride.d[1], stream); - break; - default: - return 1; - } - - return 0; -} - -nvinfer1::DataType GridPriorsTRT::getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char 
*GridPriorsTRT::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *GridPriorsTRT::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -int GridPriorsTRT::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t GridPriorsTRT::getSerializationSize() const TRT_NOEXCEPT { return serialized_size(mStride); } - -void GridPriorsTRT::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mStride); - ; -} - -////////////////////// creator ///////////////////////////// - -GridPriorsTRTCreator::GridPriorsTRTCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_h")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_w")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *GridPriorsTRTCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *GridPriorsTRTCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -nvinfer1::IPluginV2 *GridPriorsTRTCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - int stride_w = 1; - int stride_h = 1; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; - } - std::string field_name(fc->fields[i].name); - - if (field_name.compare("stride_w") == 0) { - stride_w = static_cast(fc->fields[i].data)[0]; - } - if (field_name.compare("stride_h") == 0) { - stride_h = static_cast(fc->fields[i].data)[0]; - } - } - nvinfer1::Dims stride{2, {stride_w, stride_h}}; - - GridPriorsTRT *plugin = new GridPriorsTRT(name, stride); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *GridPriorsTRTCreator::deserializePlugin(const char *name, - const void *serialData, - size_t serialLength) TRT_NOEXCEPT { - auto plugin = new GridPriorsTRT(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} -REGISTER_TENSORRT_PLUGIN(GridPriorsTRTCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"GridPriorsTRT"}; + } // namespace + + GridPriorsTRT::GridPriorsTRT(const std::string& name, const nvinfer1::Dims stride) + : TRTPluginBase(name) + , mStride(stride) + { + } + + GridPriorsTRT::GridPriorsTRT(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mStride); + } + GridPriorsTRT::~GridPriorsTRT() {} + + nvinfer1::IPluginV2DynamicExt* GridPriorsTRT::clone() const TRT_NOEXCEPT + { + GridPriorsTRT* plugin = new GridPriorsTRT(mLayerName, mStride); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs GridPriorsTRT::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + // input[0] == base_anchor + // input[1] == empty_h + // input[2] == empty_w + + nvinfer1::DimsExprs ret; + ret.nbDims = 2; + auto area = + exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *inputs[2].d[0], *inputs[1].d[0]); + ret.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *area, *(inputs[0].d[0])); + ret.d[1] = exprBuilder.constant(4); + + return ret; + } + + bool GridPriorsTRT::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT + { + if (pos == 0) + { + return 
(ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + } + else if (pos - nbInputs == 0) + { + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + } + else + { + return true; + } + } + + int GridPriorsTRT::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int num_base_anchors = inputDesc[0].dims.d[0]; + int feat_h = inputDesc[1].dims.d[0]; + int feat_w = inputDesc[2].dims.d[0]; + + const void* base_anchor = inputs[0]; + void* output = outputs[0]; + + auto data_type = inputDesc[0].type; + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + trt_grid_priors_impl((float*)base_anchor, (float*)output, num_base_anchors, feat_w, feat_h, mStride.d[0], mStride.d[1], stream); + break; + default: + return 1; + } + + return 0; + } + + nvinfer1::DataType GridPriorsTRT::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* GridPriorsTRT::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* GridPriorsTRT::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int GridPriorsTRT::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t GridPriorsTRT::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mStride); + } + + void GridPriorsTRT::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mStride); + ; + } + + ////////////////////// creator ///////////////////////////// + + GridPriorsTRTCreator::GridPriorsTRTCreator() + { + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_h")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_w")); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* GridPriorsTRTCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* GridPriorsTRTCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* GridPriorsTRTCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + int stride_w = 1; + int stride_h = 1; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("stride_w") == 0) + { + stride_w = static_cast(fc->fields[i].data)[0]; + } + if (field_name.compare("stride_h") == 0) + { + stride_h = static_cast(fc->fields[i].data)[0]; + } + } + nvinfer1::Dims stride{2, {stride_w, stride_h}}; + + GridPriorsTRT* plugin = new GridPriorsTRT(name, stride); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* GridPriorsTRTCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new GridPriorsTRT(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + REGISTER_TENSORRT_PLUGIN(GridPriorsTRTCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.hpp b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.hpp index 0036f62586..a555b2d54a 100644 --- 
a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.hpp @@ -9,58 +9,60 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class GridPriorsTRT : public TRTPluginBase { - public: - GridPriorsTRT(const std::string &name, const nvinfer1::Dims stride); +namespace mmdeploy +{ + class GridPriorsTRT : public TRTPluginBase + { + public: + GridPriorsTRT(const std::string& name, const nvinfer1::Dims stride); - GridPriorsTRT(const std::string name, const void *data, size_t length); + GridPriorsTRT(const std::string name, const void* data, size_t length); - GridPriorsTRT() = delete; + GridPriorsTRT() = delete; - ~GridPriorsTRT() TRT_NOEXCEPT override; + ~GridPriorsTRT() TRT_NOEXCEPT override; - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) + TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; - private: - nvinfer1::Dims mStride; + private: + nvinfer1::Dims mStride; - cublasHandle_t m_cublas_handle; -}; + cublasHandle_t m_cublas_handle; + }; -class GridPriorsTRTCreator : public TRTPluginCreatorBase { - public: - GridPriorsTRTCreator(); + class GridPriorsTRTCreator : public TRTPluginCreatorBase + { + public: + GridPriorsTRTCreator(); - const char *getPluginName() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() 
const TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_GRID_PRIORS_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.cu index 72c33d179a..9decc3ba6e 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.cu @@ -5,39 +5,42 @@ #include "trt_grid_priors_kernel.hpp" #include "trt_plugin_helper.hpp" -template -__global__ void trt_grid_priors_kernel(const scalar_t* base_anchor, scalar_t* output, - int num_base_anchors, int feat_w, int feat_h, int stride_w, - int stride_h) { - // load base anchor into shared memory. - extern __shared__ scalar_t shared_base_anchor[]; - for (int i = threadIdx.x; i < num_base_anchors * 4; i += blockDim.x) { - shared_base_anchor[i] = base_anchor[i]; - } - __syncthreads(); +template +__global__ void trt_grid_priors_kernel(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors, int feat_w, int feat_h, int stride_w, int stride_h) +{ + // load base anchor into shared memory. + extern __shared__ scalar_t shared_base_anchor[]; + for (int i = threadIdx.x; i < num_base_anchors * 4; i += blockDim.x) + { + shared_base_anchor[i] = base_anchor[i]; + } + __syncthreads(); - CUDA_1D_KERNEL_LOOP(index, num_base_anchors * feat_w * feat_h) { - const int a_offset = (index % num_base_anchors) << 2; - const scalar_t w = scalar_t(((index / num_base_anchors) % feat_w) * stride_w); - const scalar_t h = scalar_t((index / (feat_w * num_base_anchors)) * stride_h); + CUDA_1D_KERNEL_LOOP(index, num_base_anchors * feat_w * feat_h) + { + const int a_offset = (index % num_base_anchors) << 2; + const scalar_t w = scalar_t(((index / num_base_anchors) % feat_w) * stride_w); + const scalar_t h = scalar_t((index / (feat_w * num_base_anchors)) * stride_h); - auto out_start = output + index * 4; - out_start[0] = shared_base_anchor[a_offset] + w; - out_start[1] = shared_base_anchor[a_offset + 1] + h; - out_start[2] = shared_base_anchor[a_offset + 2] + w; - out_start[3] = shared_base_anchor[a_offset + 3] + h; - } + auto out_start = output + index * 4; + out_start[0] = shared_base_anchor[a_offset] + w; + out_start[1] = shared_base_anchor[a_offset + 1] + h; + out_start[2] = shared_base_anchor[a_offset + 2] + w; + out_start[3] = shared_base_anchor[a_offset + 3] + h; + } } -template -void trt_grid_priors_impl(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors, - int feat_w, int feat_h, int stride_w, int stride_h, cudaStream_t stream) { - trt_grid_priors_kernel<<>>( - base_anchor, output, (int)num_base_anchors, (int)feat_w, (int)feat_h, (int)stride_w, - (int)stride_h); +template +void trt_grid_priors_impl(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors, int feat_w, int feat_h, int stride_w, int stride_h, cudaStream_t stream) +{ + trt_grid_priors_kernel<<>>( + base_anchor, + output, + (int)num_base_anchors, + (int)feat_w, + (int)feat_h, + 
(int)stride_w, + (int)stride_h); } -template void trt_grid_priors_impl(const float* base_anchor, float* output, - int num_base_anchors, int feat_w, int feat_h, - int stride_w, int stride_h, cudaStream_t stream); +template void trt_grid_priors_impl(const float* base_anchor, float* output, int num_base_anchors, int feat_w, int feat_h, int stride_w, int stride_h, cudaStream_t stream); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.hpp index 77cef58c54..e050eb1047 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.hpp @@ -3,8 +3,7 @@ #define TRT_GRID_PRIORS_KERNEL_HPP #include -template -void trt_grid_priors_impl(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors, - int feat_w, int feat_h, int stride_w, int stride_h, cudaStream_t stream); +template +void trt_grid_priors_impl(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors, int feat_w, int feat_h, int stride_w, int stride_h, cudaStream_t stream); #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.cpp b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.cpp index 7e55686902..0d7ebf32da 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.cpp @@ -9,194 +9,237 @@ #include "trt_plugin_helper.hpp" #include "trt_serialize.hpp" -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"grid_sampler"}; -} // namespace - -TRTGridSampler::TRTGridSampler(const std::string &name, int mode, int paddingMode, - bool alignCorners) - : TRTPluginBase(name), mMode(mode), mPaddingMode(paddingMode), mAlignCorners(alignCorners) {} - -TRTGridSampler::TRTGridSampler(const std::string name, const void *data, size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &mMode); - deserialize_value(&data, &length, &mPaddingMode); - deserialize_value(&data, &length, &mAlignCorners); -} - -nvinfer1::IPluginV2DynamicExt *TRTGridSampler::clone() const TRT_NOEXCEPT { - TRTGridSampler *plugin = new TRTGridSampler(mLayerName, mMode, mPaddingMode, mAlignCorners); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs TRTGridSampler::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - nvinfer1::DimsExprs ret; - ret.nbDims = inputs[0].nbDims; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - for (int i = 2; i < ret.nbDims; ++i) { - ret.d[i] = inputs[1].d[i - 1]; - } - return ret; -} - -bool TRTGridSampler::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 0) { - return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - } else { - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - } -} - -void TRTGridSampler::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT { - // Validate input arguments -} - -size_t TRTGridSampler::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int 
nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -int TRTGridSampler::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workSpace, - cudaStream_t stream) TRT_NOEXCEPT { - nvinfer1::Dims input_dims = inputDesc[0].dims; - nvinfer1::Dims grid_dims = inputDesc[1].dims; - nvinfer1::Dims output_dims = outputDesc[0].dims; - - GridSamplerInterpolation interp_mode = GridSamplerInterpolation::Bilinear; - switch (mMode) { - case 0: - interp_mode = GridSamplerInterpolation::Bilinear; - break; - case 1: - interp_mode = GridSamplerInterpolation::Nearest; - break; - default: - break; - } - - GridSamplerPadding padding_mode = GridSamplerPadding::Zeros; - switch (mPaddingMode) { - case 0: - padding_mode = GridSamplerPadding::Zeros; - break; - - case 1: - padding_mode = GridSamplerPadding::Border; - break; - - case 2: - padding_mode = GridSamplerPadding::Reflection; - break; - default: - break; - } - - auto data_type = inputDesc[0].type; - - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - grid_sample((float *)outputs[0], (float *)inputs[0], (float *)inputs[1], - &(output_dims.d[0]), &(input_dims.d[0]), &(grid_dims.d[0]), - input_dims.nbDims, interp_mode, padding_mode, mAlignCorners, stream); - break; - default: - return 1; - break; - } - - return 0; -} - -nvinfer1::DataType TRTGridSampler::getOutputDataType(int index, - const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *TRTGridSampler::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTGridSampler::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -int TRTGridSampler::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t TRTGridSampler::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mMode) + serialized_size(mPaddingMode) + serialized_size(mAlignCorners); -} - -void TRTGridSampler::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mMode); - serialize_value(&buffer, mPaddingMode); - serialize_value(&buffer, mAlignCorners); -} - -////////////////////// creator ///////////////////////////// - -TRTGridSamplerCreator::TRTGridSamplerCreator() { - mPluginAttributes = std::vector( - {nvinfer1::PluginField("interpolation_mode"), nvinfer1::PluginField("padding_mode"), - nvinfer1::PluginField("align_corners")}); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *TRTGridSamplerCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTGridSamplerCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -nvinfer1::IPluginV2 *TRTGridSamplerCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - int mode = 0; - int paddingMode = 0; - bool alignCorners = false; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; - } - std::string field_name(fc->fields[i].name); - - if (field_name.compare("interpolation_mode") == 0) { - mode = static_cast(fc->fields[i].data)[0]; - } - - if (field_name.compare("padding_mode") == 0) { - paddingMode = static_cast(fc->fields[i].data)[0]; - } - - if (field_name.compare("align_corners") == 0) { - alignCorners = (bool)(static_cast(fc->fields[i].data)[0]); - } - } - - TRTGridSampler *plugin = new 
TRTGridSampler(name, mode, paddingMode, alignCorners); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"grid_sampler"}; + } // namespace + + TRTGridSampler::TRTGridSampler(const std::string& name, int mode, int paddingMode, bool alignCorners) + : TRTPluginBase(name) + , mMode(mode) + , mPaddingMode(paddingMode) + , mAlignCorners(alignCorners) + { + } + + TRTGridSampler::TRTGridSampler(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mMode); + deserialize_value(&data, &length, &mPaddingMode); + deserialize_value(&data, &length, &mAlignCorners); + } + + nvinfer1::IPluginV2DynamicExt* TRTGridSampler::clone() const TRT_NOEXCEPT + { + TRTGridSampler* plugin = new TRTGridSampler(mLayerName, mMode, mPaddingMode, mAlignCorners); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs TRTGridSampler::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + nvinfer1::DimsExprs ret; + ret.nbDims = inputs[0].nbDims; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[0].d[1]; + for (int i = 2; i < ret.nbDims; ++i) + { + ret.d[i] = inputs[1].d[i - 1]; + } + return ret; + } + + bool TRTGridSampler::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT + { + if (pos == 0) + { + return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + } + else + { + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + } + } + + void TRTGridSampler::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) TRT_NOEXCEPT + { + // Validate input arguments + } + + size_t TRTGridSampler::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT + { + return 0; + } + + int TRTGridSampler::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + nvinfer1::Dims input_dims = inputDesc[0].dims; + nvinfer1::Dims grid_dims = inputDesc[1].dims; + nvinfer1::Dims output_dims = outputDesc[0].dims; + + GridSamplerInterpolation interp_mode = GridSamplerInterpolation::Bilinear; + switch (mMode) + { + case 0: + interp_mode = GridSamplerInterpolation::Bilinear; + break; + case 1: + interp_mode = GridSamplerInterpolation::Nearest; + break; + default: + break; + } + + GridSamplerPadding padding_mode = GridSamplerPadding::Zeros; + switch (mPaddingMode) + { + case 0: + padding_mode = GridSamplerPadding::Zeros; + break; + + case 1: + padding_mode = GridSamplerPadding::Border; + break; + + case 2: + padding_mode = GridSamplerPadding::Reflection; + break; + default: + break; + } + + auto data_type = inputDesc[0].type; + + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + grid_sample((float*)outputs[0], (float*)inputs[0], (float*)inputs[1], &(output_dims.d[0]), &(input_dims.d[0]), &(grid_dims.d[0]), input_dims.nbDims, interp_mode, padding_mode, mAlignCorners, stream); + break; + default: + return 1; + break; + } + + 
return 0; + } + + nvinfer1::DataType TRTGridSampler::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* TRTGridSampler::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTGridSampler::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int TRTGridSampler::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t TRTGridSampler::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mMode) + serialized_size(mPaddingMode) + serialized_size(mAlignCorners); + } + + void TRTGridSampler::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mMode); + serialize_value(&buffer, mPaddingMode); + serialize_value(&buffer, mAlignCorners); + } + + ////////////////////// creator ///////////////////////////// -nvinfer1::IPluginV2 *TRTGridSamplerCreator::deserializePlugin(const char *name, - const void *serialData, - size_t serialLength) TRT_NOEXCEPT { - // This object will be deleted when the network is destroyed, which will - // call FCPluginDynamic::destroy() - auto plugin = new TRTGridSampler(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} + TRTGridSamplerCreator::TRTGridSamplerCreator() + { + mPluginAttributes = std::vector( + {nvinfer1::PluginField("interpolation_mode"), nvinfer1::PluginField("padding_mode"), nvinfer1::PluginField("align_corners")}); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* TRTGridSamplerCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTGridSamplerCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* TRTGridSamplerCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + int mode = 0; + int paddingMode = 0; + bool alignCorners = false; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("interpolation_mode") == 0) + { + mode = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("padding_mode") == 0) + { + paddingMode = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("align_corners") == 0) + { + alignCorners = (bool)(static_cast(fc->fields[i].data)[0]); + } + } + + TRTGridSampler* plugin = new TRTGridSampler(name, mode, paddingMode, alignCorners); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* TRTGridSamplerCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + // This object will be deleted when the network is destroyed, which will + // call FCPluginDynamic::destroy() + auto plugin = new TRTGridSampler(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } -REGISTER_TENSORRT_PLUGIN(TRTGridSamplerCreator); + REGISTER_TENSORRT_PLUGIN(TRTGridSamplerCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.hpp b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.hpp index 0f62bce7c8..1fc41e5bb8 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.hpp +++ 
b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.hpp @@ -9,76 +9,74 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { +namespace mmdeploy +{ -class TRTGridSampler : public TRTPluginBase { - public: - TRTGridSampler(const std::string &name, int mode, int paddingMode, bool alignCorners); + class TRTGridSampler : public TRTPluginBase + { + public: + TRTGridSampler(const std::string& name, int mode, int paddingMode, bool alignCorners); - TRTGridSampler(const std::string name, const void *data, size_t length); + TRTGridSampler(const std::string name, const void* data, size_t length); - TRTGridSampler() = delete; + TRTGridSampler() = delete; - ~TRTGridSampler() TRT_NOEXCEPT override = default; + ~TRTGridSampler() TRT_NOEXCEPT override = default; - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) + TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; - void 
serialize(void *buffer) const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; - private: - int mMode; - int mPaddingMode; - bool mAlignCorners; -}; + private: + int mMode; + int mPaddingMode; + bool mAlignCorners; + }; -class TRTGridSamplerCreator : public TRTPluginCreatorBase { - public: - TRTGridSamplerCreator(); + class TRTGridSamplerCreator : public TRTPluginCreatorBase + { + public: + TRTGridSamplerCreator(); - ~TRTGridSamplerCreator() TRT_NOEXCEPT override = default; + ~TRTGridSamplerCreator() TRT_NOEXCEPT override = default; - const char *getPluginName() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_GRID_SAMPLER_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.cu index 5d83f98d2c..6dafbbb126 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.cu @@ -27,370 +27,434 @@ using mmdeploy::TensorDesc; // -1 --> -0.5 // +1 --> (size - 1) + 0.5 == size - 0.5 // scale_factor = size / 2 -template <typename scalar_t> -static __forceinline__ __device__ scalar_t grid_sampler_unnormalize(scalar_t coord, int size, - bool align_corners) { - if (align_corners) { - // unnormalize coord from [-1, 1] to [0, size - 1] - return ((coord + 1.f) / 2) * (size - 1); - } else { - // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] - return ((coord + 1.f) * size - 1) / 2; - } +template<typename scalar_t> +static __forceinline__ __device__ scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners) +{ + if (align_corners) + { + // unnormalize coord from [-1, 1] to [0, size - 1] + return ((coord + 1.f) / 2) * (size - 1); + } + else + { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + return ((coord + 1.f) * size - 1) / 2; + } } // Clips coordinates to between 0 and clip_limit - 1 -template <typename scalar_t> -static __forceinline__ __device__ scalar_t clip_coordinates(scalar_t in, int clip_limit) { - return ::min(static_cast<scalar_t>(clip_limit - 1), ::max(in, static_cast<scalar_t>(0))); +template<typename scalar_t> +static __forceinline__ __device__ scalar_t clip_coordinates(scalar_t in, int clip_limit) +{ + return ::min(static_cast<scalar_t>(clip_limit - 1), ::max(in, static_cast<scalar_t>(0))); } // Reflects coordinates until they fall between low and high (inclusive). // The bounds are passed as twice their value so that half-integer values // can be represented as ints.
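// For example, reflect_coordinates(-0.5, 0, 6), the align_corners bounds for size 4, folds -0.5 to 0.5.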
-template <typename scalar_t> -static __forceinline__ __device__ scalar_t reflect_coordinates(scalar_t in, int twice_low, - int twice_high) { - if (twice_low == twice_high) { - return static_cast<scalar_t>(0); - } - scalar_t min = static_cast<scalar_t>(twice_low) / 2; - scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2; - in = ::fabs(in - min); - // `fmod` returns same sign as `in`, which is positive after the `fabs` above. - scalar_t extra = ::fmod(in, span); - int flips = static_cast<int>(::floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } else { - return span - extra + min; - } +template<typename scalar_t> +static __forceinline__ __device__ scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high) +{ + if (twice_low == twice_high) + { + return static_cast<scalar_t>(0); + } + scalar_t min = static_cast<scalar_t>(twice_low) / 2; + scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2; + in = ::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. + scalar_t extra = ::fmod(in, span); + int flips = static_cast<int>(::floor(in / span)); + if (flips % 2 == 0) + { + return extra + min; + } + else + { + return span - extra + min; + } } -template <typename scalar_t> -static __forceinline__ __device__ scalar_t safe_downgrade_to_int_range(scalar_t x) { - // -100.0 does not have special meaning. This is just to make sure - // it's not within_bounds_2d or within_bounds_3d, and does not cause - // undefined behavior. See #35506. - if (x > INT_MAX - 1 || x < INT_MIN || !::isfinite(static_cast<double>(x))) - return static_cast<scalar_t>(-100.0); - return x; +template<typename scalar_t> +static __forceinline__ __device__ scalar_t safe_downgrade_to_int_range(scalar_t x) +{ + // -100.0 does not have special meaning. This is just to make sure + // it's not within_bounds_2d or within_bounds_3d, and does not cause + // undefined behavior. See #35506.
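+ // (Any negative sentinel would work, since valid indices are non-negative.)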
+ if (x > INT_MAX - 1 || x < INT_MIN || !::isfinite(static_cast<double>(x))) + return static_cast<scalar_t>(-100.0); + return x; } // Computes the pixel source index value for a grid coordinate -template <typename scalar_t> +template<typename scalar_t> static __forceinline__ __device__ scalar_t grid_sampler_compute_source_index( - scalar_t coord, int size, GridSamplerPadding padding_mode, bool align_corners) { - coord = grid_sampler_unnormalize(coord, size, align_corners); - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - coord = clip_coordinates(coord, size); - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - if (align_corners) { - coord = reflect_coordinates(coord, 0, 2 * (size - 1)); - } else { - coord = reflect_coordinates(coord, -1, 2 * size - 1); + scalar_t coord, + int size, + GridSamplerPadding padding_mode, + bool align_corners) +{ + coord = grid_sampler_unnormalize(coord, size, align_corners); + if (padding_mode == GridSamplerPadding::Border) + { + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } + else if (padding_mode == GridSamplerPadding::Reflection) + { + // reflect coordinates by image borders + if (align_corners) + { + coord = reflect_coordinates(coord, 0, 2 * (size - 1)); + } + else + { + coord = reflect_coordinates(coord, -1, 2 * size - 1); + } + // clip coordinates to image borders + coord = clip_coordinates(coord, size); } - // clip coordinates to image borders - coord = clip_coordinates(coord, size); - } - coord = safe_downgrade_to_int_range(coord); - return coord; + coord = safe_downgrade_to_int_range(coord); + return coord; } -static __forceinline__ __device__ bool within_bounds_2d(int h, int w, int H, int W) { - return h >= 0 && h < H && w >= 0 && w < W; +static __forceinline__ __device__ bool within_bounds_2d(int h, int w, int H, int W) +{ + return h >= 0 && h < H && w >= 0 && w < W; } -static __forceinline__ __device__ bool within_bounds_3d(int d, int h, int w, int D, int H, int W) { - return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; +static __forceinline__ __device__ bool within_bounds_3d(int d, int h, int w, int D, int H, int W) +{ + return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; } -template <typename scalar_t> -__global__ void grid_sampler_2d_kernel(const int nthreads, const scalar_t *input, - const scalar_t *grid, scalar_t *output, - TensorDesc input_desc, TensorDesc grid_desc, - TensorDesc output_desc, - const GridSamplerInterpolation interpolation_mode, - const GridSamplerPadding padding_mode, bool align_corners) { - int C = input_desc.shape[1]; - int inp_H = input_desc.shape[2]; - int inp_W = input_desc.shape[3]; - int out_H = grid_desc.shape[1]; - int out_W = grid_desc.shape[2]; - int inp_sN = input_desc.stride[0]; - int inp_sC = input_desc.stride[1]; - int inp_sH = input_desc.stride[2]; - int inp_sW = input_desc.stride[3]; - int grid_sN = grid_desc.stride[0]; - int grid_sH = grid_desc.stride[1]; - int grid_sW = grid_desc.stride[2]; - int grid_sCoor = grid_desc.stride[3]; - int out_sN = output_desc.stride[0]; - int out_sC = output_desc.stride[1]; - int out_sH = output_desc.stride[2]; - int out_sW = output_desc.stride[3]; - - CUDA_1D_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int n = index / (out_H * out_W); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y coordinates from grid - scalar_t ix = grid[grid_offset]; - scalar_t iy = grid[grid_offset + 
grid_sCoor]; - - ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners); - iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = static_cast<int>(::floor(ix)); - int iy_nw = static_cast<int>(::floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - auto inp_ptr_NC = input + n * inp_sN; - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) { - *out_ptr_NCHW = static_cast<scalar_t>(0); - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; +template<typename scalar_t> +__global__ void grid_sampler_2d_kernel(const int nthreads, const scalar_t* input, const scalar_t* grid, scalar_t* output, TensorDesc input_desc, TensorDesc grid_desc, TensorDesc output_desc, const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, bool align_corners) +{ + int C = input_desc.shape[1]; + int inp_H = input_desc.shape[2]; + int inp_W = input_desc.shape[3]; + int out_H = grid_desc.shape[1]; + int out_W = grid_desc.shape[2]; + int inp_sN = input_desc.stride[0]; + int inp_sC = input_desc.stride[1]; + int inp_sH = input_desc.stride[2]; + int inp_sW = input_desc.stride[3]; + int grid_sN = grid_desc.stride[0]; + int grid_sH = grid_desc.stride[1]; + int grid_sW = grid_desc.stride[2]; + int grid_sCoor = grid_desc.stride[3]; + int out_sN = output_desc.stride[0]; + int out_sC = output_desc.stride[1]; + int out_sH = output_desc.stride[2]; + int out_sW = output_desc.stride[3]; + + CUDA_1D_KERNEL_LOOP(index, nthreads) + { + const int w = index % out_W; + const int h = (index / out_W) % out_H; + const int n = index / (out_H * out_W); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y coordinates from grid + scalar_t ix = grid[grid_offset]; + scalar_t iy = grid[grid_offset + grid_sCoor]; + + ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners); + iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners); + + if (interpolation_mode == GridSamplerInterpolation::Bilinear) + { + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = static_cast<int>(::floor(ix)); + int iy_nw = static_cast<int>(::floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + scalar_t nw = (ix_se - ix) * (iy_se - iy); + scalar_t ne = (ix - ix_sw) * (iy_sw - iy); + scalar_t sw = (ix_ne - ix) * (iy - iy_ne); + scalar_t se = (ix - ix_nw) * (iy - iy_nw);
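+ // e.g. ix = 1.25, iy = 2.5 gives nw = 0.375, ne = 0.125, sw = 0.375, se = 0.125; the four weights always sum to 1.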
+ + // calculate bilinear weighted pixel value and set output pixel + auto inp_ptr_NC = input + n * inp_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) + { + *out_ptr_NCHW = static_cast<scalar_t>(0); + if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) + { + *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) + { + *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) + { + *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) + { + *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; + } + } } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; + else if (interpolation_mode == GridSamplerInterpolation::Nearest) + { + int ix_nearest = static_cast<int>(::round(ix)); + int iy_nearest = static_cast<int>(::round(iy)); + + // assign nearest neighbor pixel value to output pixel + auto inp_ptr_NC = input + n * inp_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) + { + if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) + { + *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; + } + else + { + *out_ptr_NCHW = static_cast<scalar_t>(0); + } + } } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int ix_nearest = static_cast<int>(::round(ix)); - int iy_nearest = static_cast<int>(::round(iy)); - - // assign nearest neighbor pixel value to output pixel - auto inp_ptr_NC = input + n * inp_sN; - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) { - if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) { - *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast<scalar_t>(0); - } - } } - } } -template <typename scalar_t> -__global__ void grid_sampler_3d_kernel(const int nthreads, const scalar_t *input, - const scalar_t *grid, scalar_t *output, - TensorDesc input_desc, TensorDesc grid_desc, - TensorDesc output_desc, - const GridSamplerInterpolation interpolation_mode, - const GridSamplerPadding padding_mode, bool align_corners) { - int C = input_desc.shape[1]; - int inp_D = input_desc.shape[2]; - int inp_H = input_desc.shape[3]; - int inp_W = input_desc.shape[4]; - int out_D = grid_desc.shape[1]; - int out_H = grid_desc.shape[2]; - int out_W = grid_desc.shape[3]; - int inp_sN = input_desc.stride[0]; - int inp_sC = input_desc.stride[1]; - int inp_sD = input_desc.stride[2]; - int inp_sH = input_desc.stride[3]; - int inp_sW = input_desc.stride[4]; - int grid_sN = grid_desc.stride[0]; - int grid_sD = grid_desc.stride[1]; - int grid_sH = grid_desc.stride[2]; - int grid_sW = grid_desc.stride[3]; - int grid_sCoor = grid_desc.stride[4]; - int out_sN = output_desc.stride[0]; - int out_sC = output_desc.stride[1]; - int out_sD = output_desc.stride[2]; - int out_sH = output_desc.stride[3]; - int out_sW = output_desc.stride[4]; - - CUDA_1D_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int d = (index / (out_H * out_W)) % out_D; - const int n = index / (out_D * out_H * out_W); - const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y, z coordinates from grid - scalar_t ix = 
grid[grid_offset]; - scalar_t iy = grid[grid_offset + grid_sCoor]; - scalar_t iz = grid[grid_offset + 2 * grid_sCoor]; - - ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners); - iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners); - iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode, align_corners); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = static_cast<int>(::floor(ix)); - int iy_tnw = static_cast<int>(::floor(iy)); - int iz_tnw = static_cast<int>(::floor(iz)); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - auto inp_ptr_NC = input + n * inp_sN; - auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { - // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * - // tne - // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * - // tse - // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * - // bne - // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * - // bse - *out_ptr_NCDHW = static_cast<scalar_t>(0); - if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; - } - if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; +template<typename scalar_t> +__global__ void grid_sampler_3d_kernel(const int nthreads, const scalar_t* input, const scalar_t* grid, scalar_t* output, TensorDesc input_desc, TensorDesc grid_desc, TensorDesc output_desc, const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, bool align_corners) +{ + int C = input_desc.shape[1]; + int inp_D = input_desc.shape[2]; + int inp_H = input_desc.shape[3]; + int inp_W = input_desc.shape[4]; + int out_D = grid_desc.shape[1]; + int out_H = grid_desc.shape[2]; + int out_W = grid_desc.shape[3]; + int inp_sN = input_desc.stride[0]; + int inp_sC = input_desc.stride[1]; + int inp_sD = input_desc.stride[2]; + int inp_sH = input_desc.stride[3]; + int inp_sW = input_desc.stride[4]; + int grid_sN = grid_desc.stride[0]; + int grid_sD = grid_desc.stride[1]; + int grid_sH = grid_desc.stride[2]; + int grid_sW = grid_desc.stride[3]; + 
int grid_sCoor = grid_desc.stride[4]; + int out_sN = output_desc.stride[0]; + int out_sC = output_desc.stride[1]; + int out_sD = output_desc.stride[2]; + int out_sH = output_desc.stride[3]; + int out_sW = output_desc.stride[4]; + + CUDA_1D_KERNEL_LOOP(index, nthreads) + { + const int w = index % out_W; + const int h = (index / out_W) % out_H; + const int d = (index / (out_H * out_W)) % out_D; + const int n = index / (out_D * out_H * out_W); + const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y, z coordinates from grid + scalar_t ix = grid[grid_offset]; + scalar_t iy = grid[grid_offset + grid_sCoor]; + scalar_t iz = grid[grid_offset + 2 * grid_sCoor]; + + ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners); + iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners); + iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode, align_corners); + + if (interpolation_mode == GridSamplerInterpolation::Bilinear) + { + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = static_cast<int>(::floor(ix)); + int iy_tnw = static_cast<int>(::floor(iy)); + int iz_tnw = static_cast<int>(::floor(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + auto inp_ptr_NC = input + n * inp_sN; + auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) + { + // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * + // tne + // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * + // tse + // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * + // bne + // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * + // bse + *out_ptr_NCDHW = static_cast<scalar_t>(0); + if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; + } + if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; + } + if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; + } + if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += 
inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; + } + if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; + } + if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; + } + if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; + } + if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; + } + } } - if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; + else if (interpolation_mode == GridSamplerInterpolation::Nearest) + { + int ix_nearest = static_cast<int>(::round(ix)); + int iy_nearest = static_cast<int>(::round(iy)); + int iz_nearest = static_cast<int>(::round(iz)); + + // assign nearest neighbor pixel value to output pixel + auto inp_ptr_NC = input + n * inp_sN; + auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) + { + if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW = + inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW]; + } + else + { + *out_ptr_NCDHW = static_cast<scalar_t>(0); + } + } } - if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; - } - if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; - } - if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; - } - if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; - } - if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; - } - } - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int ix_nearest = static_cast<int>(::round(ix)); - int iy_nearest = static_cast<int>(::round(iy)); - int iz_nearest = static_cast<int>(::round(iz)); - - // assign nearest neighbor pixel value to output pixel - auto inp_ptr_NC = input + n * inp_sN; - auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { - if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW = - inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCDHW = static_cast<scalar_t>(0); - } - } } - } } -void create_desc(const int *dims, int nb_dims, TensorDesc &desc) { - memcpy(&desc.shape[0], dims, sizeof(int) * nb_dims); - desc.stride[nb_dims - 1] = 1; - for (int i = nb_dims - 2; i >= 0; --i) { - desc.stride[i] = desc.stride[i + 1] * desc.shape[i + 1]; - } +void create_desc(const int* dims, int nb_dims, TensorDesc& desc) +{ + memcpy(&desc.shape[0], dims, sizeof(int) * nb_dims);
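+ // Row-major strides; e.g. an NCHW shape of (2, 3, 32, 32) yields strides (3072, 1024, 32, 1).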
+ desc.stride[nb_dims - 1] = 1; + for (int i = nb_dims - 2; i >= 0; --i) + { + desc.stride[i] = desc.stride[i + 1] * desc.shape[i + 1]; + } } -template <typename T> -void grid_sample(T *output, const T *input, const T *grid, int *output_dims, int *input_dims, - int *grid_dims, int nb_dims, GridSamplerInterpolation interp, - GridSamplerPadding padding, bool align_corners, cudaStream_t stream) { - TensorDesc input_desc; - create_desc(input_dims, nb_dims, input_desc); +template<typename T> +void grid_sample(T* output, const T* input, const T* grid, int* output_dims, int* input_dims, int* grid_dims, int nb_dims, GridSamplerInterpolation interp, GridSamplerPadding padding, bool align_corners, cudaStream_t stream) +{ + TensorDesc input_desc; + create_desc(input_dims, nb_dims, input_desc); + + TensorDesc output_desc; + create_desc(output_dims, nb_dims, output_desc); - TensorDesc output_desc; - create_desc(output_dims, nb_dims, output_desc); + TensorDesc grid_desc; + create_desc(grid_dims, nb_dims, grid_desc); - TensorDesc grid_desc; - create_desc(grid_dims, nb_dims, grid_desc); + int count = 1; + for (int i = 0; i < nb_dims; ++i) + { + if (i == 1) + { + continue; + } + count *= output_desc.shape[i]; + } - int count = 1; - for (int i = 0; i < nb_dims; ++i) { - if (i == 1) { - continue; + if (nb_dims == 4) + { + grid_sampler_2d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>( + count, + input, + grid, + output, + input_desc, + grid_desc, + output_desc, + interp, + padding, + align_corners); + } + else if (nb_dims == 5) + { + grid_sampler_3d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>( + count, + input, + grid, + output, + input_desc, + grid_desc, + output_desc, + interp, + padding, + align_corners); + } + else + { + printf("input and grid dims should be 4 or 5\n"); } - count *= output_desc.shape[i]; - } - - if (nb_dims == 4) { - grid_sampler_2d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>( - count, input, grid, output, input_desc, grid_desc, output_desc, interp, padding, - align_corners); - } else if (nb_dims == 5) { - grid_sampler_3d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>( - count, input, grid, output, input_desc, grid_desc, output_desc, interp, padding, - align_corners); - } else { - printf("input and grid dims should be 4 or 5\n"); - } } -template void grid_sample<float>(float *output, const float *input, const float *grid, - int *output_dims, int *input_dims, int *grid_dims, int nb_dims, - GridSamplerInterpolation interp, GridSamplerPadding padding, - bool align_corners, cudaStream_t stream); +template void grid_sample<float>(float* output, const float* input, const float* grid, int* output_dims, int* input_dims, int* grid_dims, int nb_dims, GridSamplerInterpolation interp, GridSamplerPadding padding, bool align_corners, cudaStream_t stream); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.hpp index e4e50332f4..b73bd91213 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.hpp @@ -3,11 +3,18 @@ #define TRT_GRID_SAMPLER_KERNEL_HPP #include <cuda_runtime.h> -enum class GridSamplerInterpolation { Bilinear, Nearest }; -enum class GridSamplerPadding { Zeros, Border, Reflection }; +enum class GridSamplerInterpolation +{ + Bilinear, + Nearest +}; +enum class GridSamplerPadding +{ + Zeros, + Border, + Reflection +}; -template <typename T> -void grid_sample(T *output, const T *input, const T *grid, int *output_dims, int *input_dims, - int *grid_dims, int nb_dims, GridSamplerInterpolation interp, - GridSamplerPadding padding, bool align_corners, cudaStream_t 
stream); +template<typename T> +void grid_sample(T* output, const T* input, const T* grid, int* output_dims, int* input_dims, int* grid_dims, int nb_dims, GridSamplerInterpolation interp, GridSamplerPadding padding, bool align_corners, cudaStream_t stream); #endif  // TRT_GRID_SAMPLER_KERNEL_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.cpp b/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.cpp index e6aab92f4c..a3ead6d507 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.cpp @@ -12,203 +12,241 @@ using namespace nvinfer1; -namespace mmdeploy { -namespace { -constexpr const char* PLUGIN_VERSION{"1"}; -constexpr const char* PLUGIN_NAME{"TRTInstanceNormalization"}; -}  // namespace - -TRTInstanceNormalization::TRTInstanceNormalization(const std::string& name, float epsilon) - : TRTPluginBase(name), mEpsilon(epsilon) {} - -TRTInstanceNormalization::TRTInstanceNormalization(const std::string& name, void const* serialData, - size_t serialLength) - : TRTPluginBase(name) { - deserialize_value(&serialData, &serialLength, &mEpsilon); -} - -TRTInstanceNormalization::~TRTInstanceNormalization() {} - -// TRTInstanceNormalization returns one output. -int TRTInstanceNormalization::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -DimsExprs TRTInstanceNormalization::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { - nvinfer1::DimsExprs output(inputs[0]); - return output; -} - -size_t TRTInstanceNormalization::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const TRT_NOEXCEPT { - int n = inputs[0].dims.d[0]; - int c = inputs[0].dims.d[1]; - int elem_size = sizeof(float); - return getAlignedSize(n * c * elem_size) * 2; -} - -int TRTInstanceNormalization::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, - const void* const* inputs, void* const* outputs, - void* workspace, cudaStream_t stream) TRT_NOEXCEPT { - nvinfer1::Dims input_dims = inputDesc[0].dims; - int n = input_dims.d[0]; - int c = input_dims.d[1]; - int h = input_dims.d[2]; - int w = input_dims.nbDims > 3 ? input_dims.d[3] : 1; - int elem_size = sizeof(float); - - void* n_scales = (void*)workspace; - void* n_bias = (void*)((char*)workspace + getAlignedSize(n * c * elem_size)); - - const void* scales = (const void*)inputs[1]; - const void* bias = (const void*)inputs[2]; - - for (int i = 0; i < n; ++i) { - cudaMemcpyAsync((char*)n_scales + i * c * elem_size, scales, c * elem_size, - cudaMemcpyDeviceToDevice, stream); - cudaMemcpyAsync((char*)n_bias + i * c * elem_size, bias, c * elem_size, - cudaMemcpyDeviceToDevice, stream); - } - - cudnnSetTensor4dDescriptor(_b_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, n * c, 1, 1); - cudnnDataType_t cudnn_dtype{}; - convert_trt2cudnn_dtype(inputDesc[0].type, &cudnn_dtype); - cudnnSetTensor4dDescriptor(_x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w); - cudnnSetTensor4dDescriptor(_y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w); - float alpha = 1; - float beta = 0; - void const* x_ptr = inputs[0]; - void* y_ptr = outputs[0]; - cudnnSetStream(_cudnn_handle, stream); - // Note: Use of CUDNN_BATCHNORM_SPATIAL_PERSISTENT can cause numerical - // overflows (NaNs) for fp32 data in some circumstances. 
The lower- - // performance CUDNN_BATCHNORM_SPATIAL should be used if this is not - // acceptable. - cudnnBatchNormalizationForwardTraining(_cudnn_handle, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, &alpha, - &beta, _x_desc, x_ptr, _y_desc, y_ptr, _b_desc, n_scales, - n_bias, 1., nullptr, nullptr, mEpsilon, nullptr, nullptr); - return 0; -} - -size_t TRTInstanceNormalization::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mEpsilon); -} - -void TRTInstanceNormalization::serialize(void* buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mEpsilon); -} - -bool TRTInstanceNormalization::supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc* ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - switch (pos) { - case 0: - case 3: - return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT || - ioDesc[pos].type == nvinfer1::DataType::kHALF) && - ioDesc[pos].format == nvinfer1::PluginFormat::kLINEAR && - ioDesc[pos].type == ioDesc[0].type); - case 1: - case 2: - return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::PluginFormat::kLINEAR; - default: - return false; - } - return false; -} - -const char* TRTInstanceNormalization::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char* TRTInstanceNormalization::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -IPluginV2DynamicExt* TRTInstanceNormalization::clone() const TRT_NOEXCEPT { - auto* plugin = new TRTInstanceNormalization{mLayerName, mEpsilon}; - plugin->setPluginNamespace(mPluginNamespace.c_str()); - return plugin; -} - -nvinfer1::DataType TRTInstanceNormalization::getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// Attach the plugin object to an execution context and grant the plugin the -// access to some context resource. -void TRTInstanceNormalization::attachToContext(cudnnContext* cudnnContext, - cublasContext* cublasContext, - IGpuAllocator* gpuAllocator) TRT_NOEXCEPT { - _cudnn_handle = cudnnContext; - cudnnCreateTensorDescriptor(&_b_desc); - cudnnCreateTensorDescriptor(&_x_desc); - cudnnCreateTensorDescriptor(&_y_desc); -} - -// Detach the plugin object from its execution context. 
-void TRTInstanceNormalization::detachFromContext() TRT_NOEXCEPT { - if (_y_desc) { - cudnnDestroyTensorDescriptor(_y_desc); - _y_desc = nullptr; - } - if (_x_desc) { - cudnnDestroyTensorDescriptor(_x_desc); - _x_desc = nullptr; - } - if (_b_desc) { - cudnnDestroyTensorDescriptor(_b_desc); - _b_desc = nullptr; - } -} - -void TRTInstanceNormalization::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) TRT_NOEXCEPT {} - -// TRTInstanceNormalizationCreator methods -TRTInstanceNormalizationCreator::TRTInstanceNormalizationCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char* TRTInstanceNormalizationCreator::getPluginName() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char* TRTInstanceNormalizationCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -IPluginV2DynamicExt* TRTInstanceNormalizationCreator::createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT { - float epsilon = 1e-5; - const PluginField* fields = fc->fields; - for (int i = 0; i < fc->nbFields; ++i) { - const char* attrName = fields[i].name; - if (!strcmp(attrName, "epsilon")) { - epsilon = *(static_cast<const float*>(fields[i].data)); - } - } - - TRTInstanceNormalization* obj = new TRTInstanceNormalization(name, epsilon); - obj->setPluginNamespace(mNamespace.c_str()); - return obj; -} - -IPluginV2DynamicExt* TRTInstanceNormalizationCreator::deserializePlugin( - const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { - TRTInstanceNormalization* obj = new TRTInstanceNormalization{name, serialData, serialLength}; - obj->setPluginNamespace(mNamespace.c_str()); - return obj; -} -REGISTER_TENSORRT_PLUGIN(TRTInstanceNormalizationCreator); +namespace mmdeploy +{ + namespace + { + constexpr const char* PLUGIN_VERSION{"1"}; + constexpr const char* PLUGIN_NAME{"TRTInstanceNormalization"}; + }  // namespace + + TRTInstanceNormalization::TRTInstanceNormalization(const std::string& name, float epsilon) + : TRTPluginBase(name) + , mEpsilon(epsilon) + { + } + + TRTInstanceNormalization::TRTInstanceNormalization(const std::string& name, void const* serialData, size_t serialLength) + : TRTPluginBase(name) + { + deserialize_value(&serialData, &serialLength, &mEpsilon); + } + + TRTInstanceNormalization::~TRTInstanceNormalization() {} + + // TRTInstanceNormalization returns one output. 
+ int TRTInstanceNormalization::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + DimsExprs TRTInstanceNormalization::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + nvinfer1::DimsExprs output(inputs[0]); + return output; + } + + size_t TRTInstanceNormalization::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + int n = inputs[0].dims.d[0]; + int c = inputs[0].dims.d[1]; + int elem_size = sizeof(float); + return getAlignedSize(n * c * elem_size) * 2; + } + + int TRTInstanceNormalization::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT + { + nvinfer1::Dims input_dims = inputDesc[0].dims; + int n = input_dims.d[0]; + int c = input_dims.d[1]; + int h = input_dims.d[2]; + int w = input_dims.nbDims > 3 ? input_dims.d[3] : 1; + int elem_size = sizeof(float); + + void* n_scales = (void*)workspace; + void* n_bias = (void*)((char*)workspace + getAlignedSize(n * c * elem_size)); + + const void* scales = (const void*)inputs[1]; + const void* bias = (const void*)inputs[2]; + + for (int i = 0; i < n; ++i) + { + cudaMemcpyAsync((char*)n_scales + i * c * elem_size, scales, c * elem_size, cudaMemcpyDeviceToDevice, stream); + cudaMemcpyAsync((char*)n_bias + i * c * elem_size, bias, c * elem_size, cudaMemcpyDeviceToDevice, stream); + } + + cudnnSetTensor4dDescriptor(_b_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, n * c, 1, 1); + cudnnDataType_t cudnn_dtype{}; + convert_trt2cudnn_dtype(inputDesc[0].type, &cudnn_dtype); + cudnnSetTensor4dDescriptor(_x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w); + cudnnSetTensor4dDescriptor(_y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w); + float alpha = 1; + float beta = 0; + void const* x_ptr = inputs[0]; + void* y_ptr = outputs[0]; + cudnnSetStream(_cudnn_handle, stream); + // Note: Use of CUDNN_BATCHNORM_SPATIAL_PERSISTENT can cause numerical + // overflows (NaNs) for fp32 data in some circumstances. The lower- + // performance CUDNN_BATCHNORM_SPATIAL should be used if this is not + // acceptable. 
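+ // Instance norm is computed as batch norm over a (1, N*C, H, W) view of the input, so the per-sample scale/bias copies prepared above act as per-instance affine parameters.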
+ cudnnBatchNormalizationForwardTraining(_cudnn_handle, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, &alpha, &beta, _x_desc, x_ptr, _y_desc, y_ptr, _b_desc, n_scales, n_bias, 1., nullptr, nullptr, mEpsilon, nullptr, nullptr); + return 0; + } + + size_t TRTInstanceNormalization::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mEpsilon); + } + + void TRTInstanceNormalization::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mEpsilon); + } + + bool TRTInstanceNormalization::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + switch (pos) + { + case 0: + case 3: + return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT || + ioDesc[pos].type == nvinfer1::DataType::kHALF) && + ioDesc[pos].format == nvinfer1::PluginFormat::kLINEAR && + ioDesc[pos].type == ioDesc[0].type); + case 1: + case 2: + return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::PluginFormat::kLINEAR; + default: + return false; + } + return false; + } + + const char* TRTInstanceNormalization::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTInstanceNormalization::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + IPluginV2DynamicExt* TRTInstanceNormalization::clone() const TRT_NOEXCEPT + { + auto* plugin = new TRTInstanceNormalization{mLayerName, mEpsilon}; + plugin->setPluginNamespace(mPluginNamespace.c_str()); + return plugin; + } + + nvinfer1::DataType TRTInstanceNormalization::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // Attach the plugin object to an execution context and grant the plugin the + // access to some context resource. + void TRTInstanceNormalization::attachToContext(cudnnContext* cudnnContext, + cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT + { + _cudnn_handle = cudnnContext; + cudnnCreateTensorDescriptor(&_b_desc); + cudnnCreateTensorDescriptor(&_x_desc); + cudnnCreateTensorDescriptor(&_y_desc); + } + + // Detach the plugin object from its execution context. 
+ void TRTInstanceNormalization::detachFromContext() TRT_NOEXCEPT + { + if (_y_desc) + { + cudnnDestroyTensorDescriptor(_y_desc); + _y_desc = nullptr; + } + if (_x_desc) + { + cudnnDestroyTensorDescriptor(_x_desc); + _x_desc = nullptr; + } + if (_b_desc) + { + cudnnDestroyTensorDescriptor(_b_desc); + _b_desc = nullptr; + } + } + + void TRTInstanceNormalization::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT {} + + // TRTInstanceNormalizationCreator methods + TRTInstanceNormalizationCreator::TRTInstanceNormalizationCreator() + { + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1)); + + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* TRTInstanceNormalizationCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTInstanceNormalizationCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + IPluginV2DynamicExt* TRTInstanceNormalizationCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + float epsilon = 1e-5; + const PluginField* fields = fc->fields; + for (int i = 0; i < fc->nbFields; ++i) + { + const char* attrName = fields[i].name; + if (!strcmp(attrName, "epsilon")) + { + epsilon = *(static_cast<const float*>(fields[i].data)); + } + } + + TRTInstanceNormalization* obj = new TRTInstanceNormalization(name, epsilon); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + + IPluginV2DynamicExt* TRTInstanceNormalizationCreator::deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + TRTInstanceNormalization* obj = new TRTInstanceNormalization{name, serialData, serialLength}; + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + REGISTER_TENSORRT_PLUGIN(TRTInstanceNormalizationCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.hpp b/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.hpp index 2df04a5f6d..d513a59301 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.hpp @@ -14,80 +14,78 @@ typedef unsigned short half_type; -namespace mmdeploy { -class TRTInstanceNormalization final : public TRTPluginBase { - public: - TRTInstanceNormalization(const std::string& name, float epsilon); +namespace mmdeploy +{ + class TRTInstanceNormalization final : public TRTPluginBase + { + public: + TRTInstanceNormalization(const std::string& name, float epsilon); - TRTInstanceNormalization(const std::string& name, void const* serialData, size_t serialLength); + TRTInstanceNormalization(const std::string& name, void const* serialData, size_t serialLength); - TRTInstanceNormalization() = delete; + TRTInstanceNormalization() = delete; - ~TRTInstanceNormalization() TRT_NOEXCEPT override; + ~TRTInstanceNormalization() TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; - // DynamicExt plugins returns DimsExprs class instead of 
Dims + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) + TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void* buffer) const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; - // DynamicExt plugin supportsFormat update. - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; + // DynamicExt plugin supportsFormat update. + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override; - const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, - int nbInputs) const TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; - void attachToContext(cudnnContext* cudnn, cublasContext* cublas, - nvinfer1::IGpuAllocator* allocator) TRT_NOEXCEPT override; + void attachToContext(cudnnContext* cudnn, cublasContext* cublas, nvinfer1::IGpuAllocator* allocator) TRT_NOEXCEPT override; - void detachFromContext() TRT_NOEXCEPT override; + void detachFromContext() TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override; - private: - float mEpsilon{}; - cudnnHandle_t _cudnn_handle{}; - cudnnTensorDescriptor_t _x_desc{}, _y_desc{}, _b_desc{}; - std::string mPluginNamespace{}; -}; + private: + float mEpsilon{}; + cudnnHandle_t _cudnn_handle{}; + cudnnTensorDescriptor_t _x_desc{}, _y_desc{}, _b_desc{}; + std::string mPluginNamespace{}; + }; -class TRTInstanceNormalizationCreator : public TRTPluginCreatorBase { - public: - TRTInstanceNormalizationCreator(); + class TRTInstanceNormalizationCreator : public TRTPluginCreatorBase + { + public: + TRTInstanceNormalizationCreator(); - ~TRTInstanceNormalizationCreator() override = default; + 
~TRTInstanceNormalizationCreator() override = default; - const char* getPluginName() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2DynamicExt* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + nvinfer1::IPluginV2DynamicExt* createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; - nvinfer1::IPluginV2DynamicExt* deserializePlugin(const char* name, const void* serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; + nvinfer1::IPluginV2DynamicExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_INSTANCE_NORMALIZATION_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.cpp b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.cpp index 692000b740..363242e8e1 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.cpp @@ -10,297 +10,406 @@ using namespace nvinfer1; -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"MMCVModulatedDeformConv2d"}; -} // namespace - -ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic( - const std::string &name, const nvinfer1::Dims stride, const nvinfer1::Dims padding, - const nvinfer1::Dims dilation, const int deformableGroup, const int group) - : TRTPluginBase(name), - mStride(stride), - mPadding(padding), - mDilation(dilation), - mDeformableGroup(deformableGroup), - mGroup(group) { - mWithBias = false; -} - -ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(const std::string name, - const void *data, - size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &mStride); - deserialize_value(&data, &length, &mPadding); - deserialize_value(&data, &length, &mDilation); - deserialize_value(&data, &length, &mDeformableGroup); - deserialize_value(&data, &length, &mGroup); - mWithBias = false; -} -ModulatedDeformableConvPluginDynamic::~ModulatedDeformableConvPluginDynamic() {} - -nvinfer1::IPluginV2DynamicExt *ModulatedDeformableConvPluginDynamic::clone() const TRT_NOEXCEPT { - ModulatedDeformableConvPluginDynamic *plugin = new ModulatedDeformableConvPluginDynamic( - mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -static const nvinfer1::IDimensionExpr *get_hw(const nvinfer1::IDimensionExpr *input, - const nvinfer1::IDimensionExpr *weight, - const nvinfer1::IDimensionExpr *stride, - const nvinfer1::IDimensionExpr *pad, - const nvinfer1::IDimensionExpr *dilation, - nvinfer1::IExprBuilder &exprBuilder) { - using DimOp = nvinfer1::DimensionOperation; - auto expr_1 = exprBuilder.constant(1); - - // d*(w-1)+1 - auto kernel_0 = exprBuilder.operation(DimOp::kSUB, *weight, *expr_1); - auto kernel_1 = exprBuilder.operation(DimOp::kPROD, *dilation, *kernel_0); - auto kernel = exprBuilder.operation(DimOp::kSUM, *kernel_1, *expr_1); - - // (1+2*p-k)//stride -1 - auto out_0 = exprBuilder.operation(DimOp::kSUM, *pad, *pad); - auto out_1 = exprBuilder.operation(DimOp::kSUM, 
*input, *out_0); - auto out_2 = exprBuilder.operation(DimOp::kSUB, *out_1, *kernel); - auto out_3 = exprBuilder.operation(DimOp::kFLOOR_DIV, *out_2, *stride); - auto out = exprBuilder.operation(DimOp::kSUM, *out_3, *expr_1); - - return out; -} - -nvinfer1::DimsExprs ModulatedDeformableConvPluginDynamic::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - using DimOp = nvinfer1::DimensionOperation; - auto weight_dim = inputs[3].d; - nvinfer1::DimsExprs ret; - ret.nbDims = 4; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[3].d[0]; - - auto input_h = inputs[0].d[2]; - auto input_w = inputs[0].d[3]; - auto weight_h = weight_dim[2]; - auto weight_w = weight_dim[3]; - auto dilation_w = exprBuilder.constant(mDilation.d[0]); - auto dilation_h = exprBuilder.constant(mDilation.d[1]); - auto pad_w = exprBuilder.constant(mPadding.d[0]); - auto pad_h = exprBuilder.constant(mPadding.d[1]); - auto stride_w = exprBuilder.constant(mStride.d[0]); - auto stride_h = exprBuilder.constant(mStride.d[1]); - auto expr_1 = exprBuilder.constant(1); - auto expr_2 = exprBuilder.constant(2); - - ret.d[2] = get_hw(input_h, weight_h, stride_h, pad_h, dilation_h, exprBuilder); - ret.d[3] = get_hw(input_w, weight_w, stride_w, pad_w, dilation_w, exprBuilder); - - return ret; -} - -bool ModulatedDeformableConvPluginDynamic::supportsFormatCombination( - int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 0) { - return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT || - ioDesc[pos].type == nvinfer1::DataType::kHALF) && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - } else { - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - } -} - -void ModulatedDeformableConvPluginDynamic::configurePlugin( - const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) TRT_NOEXCEPT { - if (nbInputs == 5) { - mWithBias = true; - } -} - -size_t ModulatedDeformableConvPluginDynamic::getWorkspaceSize( - const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const TRT_NOEXCEPT { - int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type); - - int batch_size = inputs[0].dims.d[0]; - int nInputPlane = inputs[0].dims.d[1]; - int inputHeight = inputs[0].dims.d[2]; - int inputWidth = inputs[0].dims.d[3]; - - int nOutputPlane = outputs[0].dims.d[1]; - int outputHeight = outputs[0].dims.d[2]; - int outputWidth = outputs[0].dims.d[3]; - - int kW = inputs[3].dims.d[2]; - int kH = inputs[3].dims.d[3]; - int im2col_step = std::min(32, batch_size); - - size_t col_size = - mmdeploy::getAlignedSize(nInputPlane * kW * kH * outputHeight * outputWidth * sizeof_dtype); - - return col_size; -} - -int ModulatedDeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, void *const *outputs, - void *workSpace, - cudaStream_t stream) TRT_NOEXCEPT { - int batch = inputDesc[0].dims.d[0]; - int channels = inputDesc[0].dims.d[1]; - int height = inputDesc[0].dims.d[2]; - int width = inputDesc[0].dims.d[3]; - int channels_out = outputDesc[0].dims.d[1]; - int kernel_h = inputDesc[3].dims.d[2]; - int kernel_w = inputDesc[3].dims.d[3]; - - const void *x = inputs[0]; - const void *offset = inputs[1]; - const void *mask = inputs[2]; - const void *weight 
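// [Illustrative note added in review - not part of the patch] get_hw() above builds the
// standard convolution output-size expression out of IDimensionExpr nodes; the
// "(1+2*p-k)//stride -1" comment in the source is garbled, the chain actually computes
//   out = (in + 2*pad - (dilation*(kernel-1) + 1)) / stride + 1   (floor division).
// The same arithmetic in plain constexpr form, handy for checking shapes by hand:
constexpr int conv_out_size(int in, int kernel, int stride, int pad, int dilation)
{
    const int effective_kernel = dilation * (kernel - 1) + 1;  // d*(k-1)+1
    return (in + 2 * pad - effective_kernel) / stride + 1;     // integer division == floor here
}
static_assert(conv_out_size(64, 3, 1, 1, 1) == 64, "3x3, stride 1, pad 1 preserves size");
static_assert(conv_out_size(64, 3, 2, 1, 1) == 32, "stride 2 halves the size");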
= inputs[3]; - const void *bias = mWithBias ? inputs[4] : nullptr; - void *output = outputs[0]; - int im2col_step = std::min(batch, 32); - - // TODO: add fp16 support - auto data_type = inputDesc[0].type; - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - ModulatedDeformConvForwardCUDAKernelLauncher( - (float *)x, (float *)weight, (float *)bias, (float *)offset, (float *)mask, - (float *)output, workSpace, batch, channels, height, width, channels_out, kernel_w, - kernel_h, mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], - mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream); - break; - case nvinfer1::DataType::kHALF: - ModulatedDeformConvForwardCUDAKernelLauncher( - (half *)x, (half *)weight, (half *)bias, (half *)offset, (half *)mask, (half *)output, - workSpace, batch, channels, height, width, channels_out, kernel_w, kernel_h, mStride.d[0], - mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], mGroup, - mDeformableGroup, im2col_step, m_cublas_handle, stream); - break; - default: - return 1; - break; - } - - return 0; -} - -nvinfer1::DataType ModulatedDeformableConvPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *ModulatedDeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char *ModulatedDeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -int ModulatedDeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t ModulatedDeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) + - serialized_size(mDeformableGroup) + serialized_size(mGroup); -} - -void ModulatedDeformableConvPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mStride); - serialize_value(&buffer, mPadding); - serialize_value(&buffer, mDilation); - serialize_value(&buffer, mDeformableGroup); - serialize_value(&buffer, mGroup); -} - -void ModulatedDeformableConvPluginDynamic::attachToContext( - cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT { - m_cublas_handle = cublasContext; -} - -void ModulatedDeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {} - -////////////////////// creator ///////////////////////////// - -ModulatedDeformableConvPluginDynamicCreator::ModulatedDeformableConvPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(nvinfer1::PluginField("stride")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("padding")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("groups")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *ModulatedDeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char *ModulatedDeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - 
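// [Illustrative note added in review - not part of the patch] createPlugin() consumes a
// PluginFieldCollection whose entries match the attribute names registered in the
// creator's constructor ("stride", "padding", "dilation", "groups", "deform_groups").
// A caller-side sketch of wiring those fields up; the creator reference and the example
// values here are hypothetical:
#include <NvInferRuntime.h>
#include <array>

nvinfer1::IPluginV2* make_dcn_plugin_example(nvinfer1::IPluginCreator& creator)
{
    static const int32_t stride[2] = {1, 1};
    static const int32_t padding[2] = {1, 1};
    static const int32_t dilation[2] = {1, 1};
    static const int32_t groups = 1;
    static const int32_t deform_groups = 1;

    std::array<nvinfer1::PluginField, 5> fields = {{
        {"stride", stride, nvinfer1::PluginFieldType::kINT32, 2},
        {"padding", padding, nvinfer1::PluginFieldType::kINT32, 2},
        {"dilation", dilation, nvinfer1::PluginFieldType::kINT32, 2},
        {"groups", &groups, nvinfer1::PluginFieldType::kINT32, 1},
        {"deform_groups", &deform_groups, nvinfer1::PluginFieldType::kINT32, 1},
    }};
    nvinfer1::PluginFieldCollection fc{static_cast<int32_t>(fields.size()), fields.data()};
    return creator.createPlugin("dcn_example", &fc);
}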
nvinfer1::Dims stride{2, {1, 1}}; - nvinfer1::Dims padding{2, {0, 0}}; - nvinfer1::Dims dilation{2, {1, 1}}; - int deformableGroup = 1; - int group = 1; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"MMCVModulatedDeformConv2d"}; + } // namespace + + ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic( + const std::string& name, + const nvinfer1::Dims stride, + const nvinfer1::Dims padding, + const nvinfer1::Dims dilation, + const int deformableGroup, + const int group) + : TRTPluginBase(name) + , mStride(stride) + , mPadding(padding) + , mDilation(dilation) + , mDeformableGroup(deformableGroup) + , mGroup(group) + { + mWithBias = false; } - std::string field_name(fc->fields[i].name); - if (field_name.compare("deform_groups") == 0) { - deformableGroup = static_cast(fc->fields[i].data)[0]; + ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(const std::string name, + const void* data, + size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mStride); + deserialize_value(&data, &length, &mPadding); + deserialize_value(&data, &length, &mDilation); + deserialize_value(&data, &length, &mDeformableGroup); + deserialize_value(&data, &length, &mGroup); + mWithBias = false; + } + ModulatedDeformableConvPluginDynamic::~ModulatedDeformableConvPluginDynamic() {} + + nvinfer1::IPluginV2DynamicExt* ModulatedDeformableConvPluginDynamic::clone() const TRT_NOEXCEPT + { + ModulatedDeformableConvPluginDynamic* plugin = new ModulatedDeformableConvPluginDynamic( + mLayerName, + mStride, + mPadding, + mDilation, + mDeformableGroup, + mGroup); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + static const nvinfer1::IDimensionExpr* get_hw(const nvinfer1::IDimensionExpr* input, + const nvinfer1::IDimensionExpr* weight, + const nvinfer1::IDimensionExpr* stride, + const nvinfer1::IDimensionExpr* pad, + const nvinfer1::IDimensionExpr* dilation, + nvinfer1::IExprBuilder& exprBuilder) + { + using DimOp = nvinfer1::DimensionOperation; + auto expr_1 = exprBuilder.constant(1); + + // d*(w-1)+1 + auto kernel_0 = exprBuilder.operation(DimOp::kSUB, *weight, *expr_1); + auto kernel_1 = exprBuilder.operation(DimOp::kPROD, *dilation, *kernel_0); + auto kernel = exprBuilder.operation(DimOp::kSUM, *kernel_1, *expr_1); + + // (1+2*p-k)//stride -1 + auto out_0 = exprBuilder.operation(DimOp::kSUM, *pad, *pad); + auto out_1 = exprBuilder.operation(DimOp::kSUM, *input, *out_0); + auto out_2 = exprBuilder.operation(DimOp::kSUB, *out_1, *kernel); + auto out_3 = exprBuilder.operation(DimOp::kFLOOR_DIV, *out_2, *stride); + auto out = exprBuilder.operation(DimOp::kSUM, *out_3, *expr_1); + + return out; + } + + nvinfer1::DimsExprs ModulatedDeformableConvPluginDynamic::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + using DimOp = nvinfer1::DimensionOperation; + auto weight_dim = inputs[3].d; + nvinfer1::DimsExprs ret; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[3].d[0]; + + auto input_h = inputs[0].d[2]; + auto input_w = inputs[0].d[3]; + auto weight_h = weight_dim[2]; + auto weight_w = weight_dim[3]; + auto dilation_w = exprBuilder.constant(mDilation.d[0]); + auto dilation_h = exprBuilder.constant(mDilation.d[1]); + auto pad_w = 
exprBuilder.constant(mPadding.d[0]); + auto pad_h = exprBuilder.constant(mPadding.d[1]); + auto stride_w = exprBuilder.constant(mStride.d[0]); + auto stride_h = exprBuilder.constant(mStride.d[1]); + auto expr_1 = exprBuilder.constant(1); + auto expr_2 = exprBuilder.constant(2); + + ret.d[2] = get_hw(input_h, weight_h, stride_h, pad_h, dilation_h, exprBuilder); + ret.d[3] = get_hw(input_w, weight_w, stride_w, pad_w, dilation_w, exprBuilder); + + return ret; + } + + bool ModulatedDeformableConvPluginDynamic::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (pos == 0) + { + return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT || + ioDesc[pos].type == nvinfer1::DataType::kHALF) && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + } + else + { + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + } + } + + void ModulatedDeformableConvPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT + { + if (nbInputs == 5) + { + mWithBias = true; + } + } + + size_t ModulatedDeformableConvPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type); + + int batch_size = inputs[0].dims.d[0]; + int nInputPlane = inputs[0].dims.d[1]; + int inputHeight = inputs[0].dims.d[2]; + int inputWidth = inputs[0].dims.d[3]; + + int nOutputPlane = outputs[0].dims.d[1]; + int outputHeight = outputs[0].dims.d[2]; + int outputWidth = outputs[0].dims.d[3]; + + int kW = inputs[3].dims.d[2]; + int kH = inputs[3].dims.d[3]; + int im2col_step = std::min(32, batch_size); + + size_t col_size = + mmdeploy::getAlignedSize(nInputPlane * kW * kH * outputHeight * outputWidth * sizeof_dtype); + + return col_size; + } + + int ModulatedDeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int batch = inputDesc[0].dims.d[0]; + int channels = inputDesc[0].dims.d[1]; + int height = inputDesc[0].dims.d[2]; + int width = inputDesc[0].dims.d[3]; + int channels_out = outputDesc[0].dims.d[1]; + int kernel_h = inputDesc[3].dims.d[2]; + int kernel_w = inputDesc[3].dims.d[3]; + + const void* x = inputs[0]; + const void* offset = inputs[1]; + const void* mask = inputs[2]; + const void* weight = inputs[3]; + const void* bias = mWithBias ? 
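// [Illustrative note added in review - not part of the patch] getWorkspaceSize() above
// reserves a single im2col column buffer of nInputPlane * kW * kH * outH * outW
// elements (rounded up by getAlignedSize()); the kernel launcher then reuses that one
// buffer batch element by batch element. Back-of-the-envelope check of the size:
#include <cstddef>
constexpr std::size_t im2col_bytes(std::size_t c_in, std::size_t kh, std::size_t kw,
                                   std::size_t out_h, std::size_t out_w, std::size_t elem)
{
    return c_in * kh * kw * out_h * out_w * elem;  // before alignment padding
}
// e.g. C=64, 3x3 kernel, 56x56 output, fp32: 64*3*3*56*56*4 = 7,225,344 B (~6.9 MiB)
static_assert(im2col_bytes(64, 3, 3, 56, 56, 4) == 7225344, "im2col column buffer size");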
inputs[4] : nullptr; + void* output = outputs[0]; + int im2col_step = std::min(batch, 32); + + // TODO: add fp16 support + auto data_type = inputDesc[0].type; + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + ModulatedDeformConvForwardCUDAKernelLauncher( + (float*)x, + (float*)weight, + (float*)bias, + (float*)offset, + (float*)mask, + (float*)output, + workSpace, + batch, + channels, + height, + width, + channels_out, + kernel_w, + kernel_h, + mStride.d[0], + mStride.d[1], + mPadding.d[0], + mPadding.d[1], + mDilation.d[0], + mDilation.d[1], + mGroup, + mDeformableGroup, + im2col_step, + m_cublas_handle, + stream); + break; + case nvinfer1::DataType::kHALF: + ModulatedDeformConvForwardCUDAKernelLauncher( + (half*)x, + (half*)weight, + (half*)bias, + (half*)offset, + (half*)mask, + (half*)output, + workSpace, + batch, + channels, + height, + width, + channels_out, + kernel_w, + kernel_h, + mStride.d[0], + mStride.d[1], + mPadding.d[0], + mPadding.d[1], + mDilation.d[0], + mDilation.d[1], + mGroup, + mDeformableGroup, + im2col_step, + m_cublas_handle, + stream); + break; + default: + return 1; + break; + } + + return 0; + } + + nvinfer1::DataType ModulatedDeformableConvPluginDynamic::getOutputDataType( + int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* ModulatedDeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* ModulatedDeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int ModulatedDeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t ModulatedDeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) + + serialized_size(mDeformableGroup) + serialized_size(mGroup); + } + + void ModulatedDeformableConvPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mStride); + serialize_value(&buffer, mPadding); + serialize_value(&buffer, mDilation); + serialize_value(&buffer, mDeformableGroup); + serialize_value(&buffer, mGroup); + } + + void ModulatedDeformableConvPluginDynamic::attachToContext( + cudnnContext* cudnnContext, + cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT + { + m_cublas_handle = cublasContext; + } + + void ModulatedDeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {} + + ////////////////////// creator ///////////////////////////// + + ModulatedDeformableConvPluginDynamicCreator::ModulatedDeformableConvPluginDynamicCreator() + { + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(nvinfer1::PluginField("stride")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("padding")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("groups")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups")); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); } - if (field_name.compare("groups") == 0) { - group = static_cast(fc->fields[i].data)[0]; + const char* ModulatedDeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; } - if (field_name.compare("stride") == 0) { - stride.nbDims = 2; - stride.d[0] = static_cast(fc->fields[i].data)[0]; - stride.d[1] = 
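// [Illustrative note added in review - not part of the patch] serialize() above and the
// (name, data, length) constructor form a pair: serialize_value() writes mStride,
// mPadding, mDilation, mDeformableGroup, mGroup back-to-back, and deserialize_value()
// must pop them in exactly that order. A minimal round-trip sketch:
#include <vector>
void dcn_plugin_roundtrip(const mmdeploy::ModulatedDeformableConvPluginDynamic& plugin)
{
    std::vector<char> blob(plugin.getSerializationSize());
    plugin.serialize(blob.data());  // packs the five fields in declaration order

    // deserializePlugin() hands the same bytes to the deserializing constructor,
    // which reads the fields back in the identical order.
    mmdeploy::ModulatedDeformableConvPluginDynamic copy("roundtrip", blob.data(), blob.size());
    (void)copy;
}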
static_cast(fc->fields[i].data)[1]; + const char* ModulatedDeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; } - if (field_name.compare("padding") == 0) { - padding.nbDims = 2; - padding.d[0] = static_cast(fc->fields[i].data)[0]; - padding.d[1] = static_cast(fc->fields[i].data)[1]; + nvinfer1::IPluginV2* ModulatedDeformableConvPluginDynamicCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + nvinfer1::Dims stride{2, {1, 1}}; + nvinfer1::Dims padding{2, {0, 0}}; + nvinfer1::Dims dilation{2, {1, 1}}; + int deformableGroup = 1; + int group = 1; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("deform_groups") == 0) + { + deformableGroup = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("groups") == 0) + { + group = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("stride") == 0) + { + stride.nbDims = 2; + stride.d[0] = static_cast(fc->fields[i].data)[0]; + stride.d[1] = static_cast(fc->fields[i].data)[1]; + } + + if (field_name.compare("padding") == 0) + { + padding.nbDims = 2; + padding.d[0] = static_cast(fc->fields[i].data)[0]; + padding.d[1] = static_cast(fc->fields[i].data)[1]; + } + + if (field_name.compare("dilation") == 0) + { + dilation.nbDims = 2; + dilation.d[0] = static_cast(fc->fields[i].data)[0]; + dilation.d[1] = static_cast(fc->fields[i].data)[1]; + } + } + + ModulatedDeformableConvPluginDynamic* plugin = new ModulatedDeformableConvPluginDynamic( + name, + stride, + padding, + dilation, + deformableGroup, + group); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; } - if (field_name.compare("dilation") == 0) { - dilation.nbDims = 2; - dilation.d[0] = static_cast(fc->fields[i].data)[0]; - dilation.d[1] = static_cast(fc->fields[i].data)[1]; + nvinfer1::IPluginV2* ModulatedDeformableConvPluginDynamicCreator::deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new ModulatedDeformableConvPluginDynamic(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; } - } - - ModulatedDeformableConvPluginDynamic *plugin = new ModulatedDeformableConvPluginDynamic( - name, stride, padding, dilation, deformableGroup, group); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT { - auto plugin = new ModulatedDeformableConvPluginDynamic(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} -REGISTER_TENSORRT_PLUGIN(ModulatedDeformableConvPluginDynamicCreator); + REGISTER_TENSORRT_PLUGIN(ModulatedDeformableConvPluginDynamicCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.hpp b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.hpp index 2dc6ed2f20..2082d83b9a 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.hpp @@ -9,74 +9,69 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class 
ModulatedDeformableConvPluginDynamic : public TRTPluginBase { - public: - ModulatedDeformableConvPluginDynamic(const std::string &name, const nvinfer1::Dims stride, - const nvinfer1::Dims padding, const nvinfer1::Dims dilation, - const int deformableGroup, const int group); - - ModulatedDeformableConvPluginDynamic(const std::string name, const void *data, size_t length); - - ModulatedDeformableConvPluginDynamic() = delete; - - ~ModulatedDeformableConvPluginDynamic() TRT_NOEXCEPT override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override; - void detachFromContext() TRT_NOEXCEPT override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; - - private: - nvinfer1::Dims mStride; - nvinfer1::Dims mPadding; - nvinfer1::Dims mDilation; - int mDeformableGroup; - int mGroup; - bool mWithBias; - - cublasHandle_t m_cublas_handle; -}; - -class ModulatedDeformableConvPluginDynamicCreator : public TRTPluginCreatorBase { - public: - ModulatedDeformableConvPluginDynamicCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char *getPluginVersion() const TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class ModulatedDeformableConvPluginDynamic : public TRTPluginBase + { + public: + ModulatedDeformableConvPluginDynamic(const std::string& name, const nvinfer1::Dims stride, const nvinfer1::Dims padding, const nvinfer1::Dims dilation, const int deformableGroup, const int group); + + ModulatedDeformableConvPluginDynamic(const std::string name, const void* data, size_t length); + + ModulatedDeformableConvPluginDynamic() = delete; + + ~ModulatedDeformableConvPluginDynamic() TRT_NOEXCEPT override; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const 
nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) + TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; + void detachFromContext() TRT_NOEXCEPT override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; + + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + private: + nvinfer1::Dims mStride; + nvinfer1::Dims mPadding; + nvinfer1::Dims mDilation; + int mDeformableGroup; + int mGroup; + bool mWithBias; + + cublasHandle_t m_cublas_handle; + }; + + class ModulatedDeformableConvPluginDynamicCreator : public TRTPluginCreatorBase + { + public: + ModulatedDeformableConvPluginDynamicCreator(); + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_MODULATED_DEFORM_CONV_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.cu index 1e1f99d5ff..21fc6cacf5 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.cu @@ -7,132 +7,228 @@ #include "trt_modulated_deform_conv_kernel.hpp" #include "trt_plugin_helper.hpp" -template -void trt_modulated_deformable_im2col(const T* data_im_, const T* data_offset_, const T* data_mask_, - const int batch_size, const int channels, const int height_im, - const int width_im, const int height_col, const int width_col, - const int kernel_h, const int kenerl_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int deformable_group, T* data_col_, - cudaStream_t stream) { - // num_axes should be smaller than block size - const int channel_per_deformable_group = channels / deformable_group; - const int num_kernels = channels * batch_size * height_col * width_col; - - modulated_deformable_im2col_gpu_kernel - <<>>( - num_kernels, data_im_, data_offset_, 
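// [Illustrative note added in review - not part of the patch] In the bias kernels below,
// output_add_bias_kernel recovers the channel of a flat NCHW offset via
// (index % step_batch) / step_channel, where step_batch = C*H*W and step_channel = H*W;
// the __half specialization uses native __hadd only on sm_53+ and otherwise falls back
// to float math. The indexing, spelled out:
#include <cstddef>
constexpr std::size_t channel_of(std::size_t flat_index, std::size_t channels, std::size_t hw)
{
    return (flat_index % (channels * hw)) / hw;  // (index % step_batch) / step_channel
}
// e.g. C=8, H*W=100: element 1234 sits in sample 1, channel 4 (1234 % 800 = 434; 434/100 = 4)
static_assert(channel_of(1234, 8, 100) == 4, "flat NCHW index -> channel");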
data_mask_, height_im, width_im, kernel_h, kenerl_w, - pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, - batch_size, channels, deformable_group, height_col, width_col, data_col_); - - cudaCheckError(); +template +void trt_modulated_deformable_im2col(const T* data_im_, const T* data_offset_, const T* data_mask_, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, T* data_col_, cudaStream_t stream) +{ + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + modulated_deformable_im2col_gpu_kernel + <<>>( + num_kernels, + data_im_, + data_offset_, + data_mask_, + height_im, + width_im, + kernel_h, + kenerl_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + batch_size, + channels, + deformable_group, + height_col, + width_col, + data_col_); + + cudaCheckError(); } -template -__global__ void output_add_bias_kernel(scalar_t* output, const scalar_t* bias, size_t step_batch, - size_t step_channel, size_t n) { - CUDA_1D_KERNEL_LOOP(index, n) { output[index] += bias[(index % step_batch) / step_channel]; } +template +__global__ void output_add_bias_kernel(scalar_t* output, const scalar_t* bias, size_t step_batch, size_t step_channel, size_t n) +{ + CUDA_1D_KERNEL_LOOP(index, n) + { + output[index] += bias[(index % step_batch) / step_channel]; + } } #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) -template <> -__global__ void output_add_bias_kernel<__half>(__half* output, const __half* bias, - size_t step_batch, size_t step_channel, size_t n) { - CUDA_1D_KERNEL_LOOP(index, n) { - const __half b = bias[(index % step_batch) / step_channel]; - const __half o = output[index]; - output[index] = __hadd(o, b); - } +template<> +__global__ void output_add_bias_kernel<__half>(__half* output, const __half* bias, size_t step_batch, size_t step_channel, size_t n) +{ + CUDA_1D_KERNEL_LOOP(index, n) + { + const __half b = bias[(index % step_batch) / step_channel]; + const __half o = output[index]; + output[index] = __hadd(o, b); + } } #else -template <> -__global__ void output_add_bias_kernel<__half>(__half* output, const __half* bias, - size_t step_batch, size_t step_channel, size_t n) { - CUDA_1D_KERNEL_LOOP(index, n) { - const __half b = bias[(index % step_batch) / step_channel]; - const __half o = output[index]; - output[index] = __float2half(__half2float(o) + __half2float(b)); - } +template<> +__global__ void output_add_bias_kernel<__half>(__half* output, const __half* bias, size_t step_batch, size_t step_channel, size_t n) +{ + CUDA_1D_KERNEL_LOOP(index, n) + { + const __half b = bias[(index % step_batch) / step_channel]; + const __half o = output[index]; + output[index] = __float2half(__half2float(o) + __half2float(b)); + } } #endif -template -static void output_add_bias(scalar_t* output, const scalar_t* bias, size_t batch, size_t channel, - size_t height, size_t width, cudaStream_t stream) { - size_t step_channel = height * width; - size_t step_batch = step_channel * channel; - size_t n = step_batch * batch; - output_add_bias_kernel<<>>(output, bias, step_batch, - step_channel, n); +template +static void 
output_add_bias(scalar_t* output, const scalar_t* bias, size_t batch, size_t channel, size_t height, size_t width, cudaStream_t stream)
+{
+    size_t step_channel = height * width;
+    size_t step_batch = step_channel * channel;
+    size_t n = step_batch * batch;
+    output_add_bias_kernel<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(output, bias, step_batch, step_channel, n);
 }
 
-template <typename scalar_t>
+template<typename scalar_t>
 void ModulatedDeformConvForwardCUDAKernelLauncher(
-    const scalar_t* input, const scalar_t* weight, const scalar_t* bias, const scalar_t* offset,
-    const scalar_t* mask, scalar_t* output, void* workspace, int batch, int channels, int height,
-    int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
-    int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
-    cublasHandle_t cublas_handle, cudaStream_t stream) {
-  bool with_bias = (bias != nullptr);
-
-  im2col_step = std::min(int(batch), im2col_step);
-  assert(batch % im2col_step == 0);
-
-  const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
-  const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
-
-  scalar_t* columns = (scalar_t*)workspace;
-
-  const size_t input_step = channels * height * width;
-  const size_t offset_step = deformable_group * kernel_h * kernel_w * 2 * height_out * width_out;
-  const size_t mask_step = deformable_group * kernel_h * kernel_w * height_out * width_out;
-  const size_t out_step = channels_out * height_out * width_out;
-  const size_t out_group_step = out_step / group;
-  const size_t col_g_step = channels * kernel_w * kernel_h / group * height_out * width_out;
-  const size_t weight_g_step = channels_out / group * channels / group * kernel_h * kernel_w;
-
-  const int m = channels_out / group;
-  const int n = height_out * width_out;
-  const int k = channels / group * kernel_h * kernel_w;
-  scalar_t alpha = 1.;
-  scalar_t beta = 0.;
-
-  for (int b = 0; b < batch; b++) {
-    const scalar_t* input_start = input + b * input_step;
-    const scalar_t* offset_start = offset + b * offset_step;
-    const scalar_t* mask_start = mask + b * mask_step;
-    trt_modulated_deformable_im2col(
-        input_start, offset_start, mask_start, 1, channels, height, width, height_out, width_out,
-        kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
-        deformable_group, columns, stream);
-
-    for (int g = 0; g < group; g++) {
-      const scalar_t* weight_start = weight + g * weight_g_step;
-      scalar_t* col_start = columns + g * col_g_step;
-      scalar_t* out_buffer_start = output + b * out_step + g * out_group_step;
-
-      cublasGemmWrap(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, col_start,
-                     n, weight_start, k, &beta, out_buffer_start, n);
-      cudaCheckError();
+    const scalar_t* input,
+    const scalar_t* weight,
+    const scalar_t* bias,
+    const scalar_t* offset,
+    const scalar_t* mask,
+    scalar_t* output,
+    void* workspace,
+    int batch,
+    int channels,
+    int height,
+    int width,
+    int channels_out,
+    int kernel_w,
+    int kernel_h,
+    int stride_w,
+    int stride_h,
+    int pad_w,
+    int pad_h,
+    int dilation_w,
+    int dilation_h,
+    int group,
+    int deformable_group,
+    int im2col_step,
+    cublasHandle_t cublas_handle,
+    cudaStream_t stream)
+{
+    bool with_bias = (bias != nullptr);
+
+    im2col_step = std::min(int(batch), im2col_step);
+    assert(batch % im2col_step == 0);
+
+    const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+    const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w -
1) + 1)) / stride_w + 1; + + scalar_t* columns = (scalar_t*)workspace; + + const size_t input_step = channels * height * width; + const size_t offset_step = deformable_group * kernel_h * kernel_w * 2 * height_out * width_out; + const size_t mask_step = deformable_group * kernel_h * kernel_w * height_out * width_out; + const size_t out_step = channels_out * height_out * width_out; + const size_t out_group_step = out_step / group; + const size_t col_g_step = channels * kernel_w * kernel_h / group * height_out * width_out; + const size_t weight_g_step = channels_out / group * channels / group * kernel_h * kernel_w; + + const int m = channels_out / group; + const int n = height_out * width_out; + const int k = channels / group * kernel_h * kernel_w; + scalar_t alpha = 1.; + scalar_t beta = 0.; + + for (int b = 0; b < batch; b++) + { + const scalar_t* input_start = input + b * input_step; + const scalar_t* offset_start = offset + b * offset_step; + const scalar_t* mask_start = mask + b * mask_step; + trt_modulated_deformable_im2col( + input_start, + offset_start, + mask_start, + 1, + channels, + height, + width, + height_out, + width_out, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + deformable_group, + columns, + stream); + + for (int g = 0; g < group; g++) + { + const scalar_t* weight_start = weight + g * weight_g_step; + scalar_t* col_start = columns + g * col_g_step; + scalar_t* out_buffer_start = output + b * out_step + g * out_group_step; + + cublasGemmWrap(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, col_start, n, weight_start, k, &beta, out_buffer_start, n); + cudaCheckError(); + } } - } - if (with_bias) { - output_add_bias(output, bias, batch, channels_out, height_out, width_out, stream); - } + if (with_bias) + { + output_add_bias(output, bias, batch, channels_out, height_out, width_out, stream); + } } template void ModulatedDeformConvForwardCUDAKernelLauncher( - const float* input, const float* weight, const float* bias, const float* offset, - const float* mask, float* output, void* workspace, int batch, int channels, int height, - int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w, - int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step, - cublasHandle_t cublas_handle, cudaStream_t stream); + const float* input, + const float* weight, + const float* bias, + const float* offset, + const float* mask, + float* output, + void* workspace, + int batch, + int channels, + int height, + int width, + int channels_out, + int kernel_w, + int kernel_h, + int stride_w, + int stride_h, + int pad_w, + int pad_h, + int dilation_w, + int dilation_h, + int group, + int deformable_group, + int im2col_step, + cublasHandle_t cublas_handle, + cudaStream_t stream); template void ModulatedDeformConvForwardCUDAKernelLauncher<__half>( - const __half* input, const __half* weight, const __half* bias, const __half* offset, - const __half* mask, __half* output, void* workspace, int batch, int channels, int height, - int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w, - int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step, - cublasHandle_t cublas_handle, cudaStream_t stream); + const __half* input, + const __half* weight, + const __half* bias, + const __half* offset, + const __half* mask, + __half* output, + void* workspace, + int batch, + int channels, + int height, + int width, + int 
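// [Illustrative note added in review - not part of the patch] Each per-group GEMM above
// multiplies a (C_out/g) x (C_in/g * kH * kW) weight slice by the im2col matrix of shape
// (C_in/g * kH * kW) x (outH * outW). cuBLAS is column-major, so passing the row-major
// operands with dimensions (n, m, k) and no transpose yields the row-major m x n output
// slice directly. Shape check:
constexpr int gemm_m(int c_out, int g) { return c_out / g; }
constexpr int gemm_k(int c_in, int g, int kh, int kw) { return c_in / g * kh * kw; }
constexpr int gemm_n(int out_h, int out_w) { return out_h * out_w; }
// C_in=64, C_out=128, g=2, 3x3 kernel, 56x56 output -> (64 x 288) * (288 x 3136)
static_assert(gemm_m(128, 2) == 64 && gemm_k(64, 2, 3, 3) == 288 && gemm_n(56, 56) == 3136,
              "per-group GEMM dimensions");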
channels_out,
+    int kernel_w,
+    int kernel_h,
+    int stride_w,
+    int stride_h,
+    int pad_w,
+    int pad_h,
+    int dilation_w,
+    int dilation_h,
+    int group,
+    int deformable_group,
+    int im2col_step,
+    cublasHandle_t cublas_handle,
+    cudaStream_t stream);
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.hpp
index 4cdec4fb38..3a1298558c 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.hpp
@@ -4,12 +4,32 @@
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
-template <typename scalar_t>
+template<typename scalar_t>
 void ModulatedDeformConvForwardCUDAKernelLauncher(
-    const scalar_t* input, const scalar_t* weight, const scalar_t* bias, const scalar_t* offset,
-    const scalar_t* mask, scalar_t* output, void* workspace, int batch, int channels, int height,
-    int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
-    int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
-    cublasHandle_t cublas_handle, cudaStream_t stream);
+    const scalar_t* input,
+    const scalar_t* weight,
+    const scalar_t* bias,
+    const scalar_t* offset,
+    const scalar_t* mask,
+    scalar_t* output,
+    void* workspace,
+    int batch,
+    int channels,
+    int height,
+    int width,
+    int channels_out,
+    int kernel_w,
+    int kernel_h,
+    int stride_w,
+    int stride_h,
+    int pad_w,
+    int pad_h,
+    int dilation_w,
+    int dilation_h,
+    int group,
+    int deformable_group,
+    int im2col_step,
+    cublasHandle_t cublas_handle,
+    cudaStream_t stream);
 #endif
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp
index ad9a518da7..456acca9b4 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp
@@ -9,219 +9,263 @@
 #include "trt_multi_level_roi_align_kernel.hpp"
 #include "trt_plugin_helper.hpp"
 #include "trt_serialize.hpp"
-namespace mmdeploy {
-namespace {
-static const char *PLUGIN_VERSION{"1"};
-static const char *PLUGIN_NAME{"MMCVMultiLevelRoiAlign"};
-}  // namespace
-
-TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string &name, int alignedHeight,
-                                             int alignedWidth, int poolMode, int sampleNum,
-                                             const std::vector<float> &featmapStrides,
-                                             float roiScaleFactor, int finestScale, bool aligned)
-    : TRTPluginBase(name),
-      mAlignedHeight(alignedHeight),
-      mAlignedWidth(alignedWidth),
-      mPoolMode(poolMode),
-      mSampleNum(sampleNum),
-      mFeatmapStrides(featmapStrides),
-      mRoiScaleFactor(roiScaleFactor),
-      mFinestScale(finestScale),
-      mAligned(aligned) {}
-
-TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string name, const void *data,
-                                             size_t length)
-    : TRTPluginBase(name) {
-  deserialize_value(&data, &length, &mAlignedHeight);
-  deserialize_value(&data, &length, &mAlignedWidth);
-  deserialize_value(&data, &length, &mPoolMode);
-  deserialize_value(&data, &length, &mSampleNum);
-  deserialize_value(&data, &length, &mRoiScaleFactor);
-  deserialize_value(&data, &length, &mFinestScale);
-  deserialize_value(&data, &length, &mAligned);
-  deserialize_value(&data, &length, &mFeatmapStrides);
-}
-
-nvinfer1::IPluginV2DynamicExt *TRTMultiLevelRoiAlign::clone() const TRT_NOEXCEPT {
-
TRTMultiLevelRoiAlign *plugin = - new TRTMultiLevelRoiAlign(mLayerName, mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum, - mFeatmapStrides, mRoiScaleFactor, mFinestScale, mAligned); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs TRTMultiLevelRoiAlign::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - // warning, nbInputs should equal to mFeatmapStrides.size() + 1 - nvinfer1::DimsExprs ret; - ret.nbDims = 4; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[1].d[1]; - ret.d[2] = exprBuilder.constant(mAlignedHeight); - ret.d[3] = exprBuilder.constant(mAlignedWidth); - - return ret; -} - -bool TRTMultiLevelRoiAlign::supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc *ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; -} - -void TRTMultiLevelRoiAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT { - // Validate input arguments - ASSERT(nbOutputs == 1); - ASSERT(nbInputs >= 1); - mFeatmapStrides = - std::vector(mFeatmapStrides.begin(), mFeatmapStrides.begin() + (nbInputs - 1)); -} - -size_t TRTMultiLevelRoiAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -int TRTMultiLevelRoiAlign::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, void *const *outputs, void *workSpace, - cudaStream_t stream) TRT_NOEXCEPT { - int num_rois = inputDesc[0].dims.d[0]; - int batch_size = inputDesc[1].dims.d[0]; - int channels = inputDesc[1].dims.d[1]; - - const int kMaxFeatMap = 10; - int heights[kMaxFeatMap]; - int widths[kMaxFeatMap]; - float strides[kMaxFeatMap]; - - int num_feats = mFeatmapStrides.size(); - for (int i = 0; i < num_feats; ++i) { - heights[i] = inputDesc[i + 1].dims.d[2]; - widths[i] = inputDesc[i + 1].dims.d[3]; - strides[i] = mFeatmapStrides[i]; - } - - const void *rois = inputs[0]; - const void *const *feats = inputs + 1; - - multi_level_roi_align((float *)outputs[0], (const float *)rois, num_rois, feats, num_feats, - batch_size, channels, &heights[0], &widths[0], &strides[0], - mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum, - mRoiScaleFactor, mFinestScale, mAligned, stream); - - return 0; -} - -nvinfer1::DataType TRTMultiLevelRoiAlign::getOutputDataType(int index, - const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return nvinfer1::DataType::kFLOAT; -} - -// IPluginV2 Methods -const char *TRTMultiLevelRoiAlign::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTMultiLevelRoiAlign::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -int TRTMultiLevelRoiAlign::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t TRTMultiLevelRoiAlign::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) + - serialized_size(mAlignedWidth) + serialized_size(mPoolMode) + serialized_size(mSampleNum) + - serialized_size(mRoiScaleFactor) + serialized_size(mFinestScale) + - serialized_size(mAligned); -} - -void TRTMultiLevelRoiAlign::serialize(void *buffer) const 
TRT_NOEXCEPT { - serialize_value(&buffer, mAlignedHeight); - serialize_value(&buffer, mAlignedWidth); - serialize_value(&buffer, mPoolMode); - serialize_value(&buffer, mSampleNum); - serialize_value(&buffer, mRoiScaleFactor); - serialize_value(&buffer, mFinestScale); - serialize_value(&buffer, mAligned); - serialize_value(&buffer, mFeatmapStrides); -} - -TRTMultiLevelRoiAlignCreator::TRTMultiLevelRoiAlignCreator() { - mPluginAttributes = std::vector( - {nvinfer1::PluginField("output_height"), nvinfer1::PluginField("output_width"), - nvinfer1::PluginField("pool_mode"), nvinfer1::PluginField("sampling_ratio"), - nvinfer1::PluginField("featmap_strides"), nvinfer1::PluginField("roi_scale_factor"), - nvinfer1::PluginField("finest_scale"), nvinfer1::PluginField("aligned")}); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *TRTMultiLevelRoiAlignCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTMultiLevelRoiAlignCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - int alignedHeight = 7; - int alignedWidth = 7; - int poolMode = 0; - int sampleNum = 2; - std::vector featmapStrides; - float roiScaleFactor = -1; - int finestScale = 56; - bool aligned = false; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; - } - std::string field_name(fc->fields[i].name); - - if (field_name.compare("output_height") == 0) { - alignedHeight = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("output_width") == 0) { - alignedWidth = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("pool_mode") == 0) { - poolMode = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("sampling_ratio") == 0) { - sampleNum = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("roi_scale_factor") == 0) { - roiScaleFactor = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("finest_scale") == 0) { - finestScale = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("featmap_strides") == 0) { - int data_size = (fc->fields[i].length); - const float *data_start = static_cast(fc->fields[i].data); - featmapStrides = std::vector(data_start, data_start + data_size); - } else if (field_name.compare("aligned") == 0) { - int aligned_int = static_cast(fc->fields[i].data)[0]; - aligned = aligned_int != 0; - } - } - - ASSERT(featmapStrides.size() != 0); - - TRTMultiLevelRoiAlign *plugin = - new TRTMultiLevelRoiAlign(name, alignedHeight, alignedWidth, poolMode, sampleNum, - featmapStrides, roiScaleFactor, finestScale, aligned); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT { - auto plugin = new TRTMultiLevelRoiAlign(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -REGISTER_TENSORRT_PLUGIN(TRTMultiLevelRoiAlignCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"MMCVMultiLevelRoiAlign"}; + } // namespace + + TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string& name, int alignedHeight, int alignedWidth, int poolMode, int sampleNum, const 
std::vector& featmapStrides, float roiScaleFactor, int finestScale, bool aligned) + : TRTPluginBase(name) + , mAlignedHeight(alignedHeight) + , mAlignedWidth(alignedWidth) + , mPoolMode(poolMode) + , mSampleNum(sampleNum) + , mFeatmapStrides(featmapStrides) + , mRoiScaleFactor(roiScaleFactor) + , mFinestScale(finestScale) + , mAligned(aligned) + { + } + + TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mAlignedHeight); + deserialize_value(&data, &length, &mAlignedWidth); + deserialize_value(&data, &length, &mPoolMode); + deserialize_value(&data, &length, &mSampleNum); + deserialize_value(&data, &length, &mRoiScaleFactor); + deserialize_value(&data, &length, &mFinestScale); + deserialize_value(&data, &length, &mAligned); + deserialize_value(&data, &length, &mFeatmapStrides); + } + + nvinfer1::IPluginV2DynamicExt* TRTMultiLevelRoiAlign::clone() const TRT_NOEXCEPT + { + TRTMultiLevelRoiAlign* plugin = + new TRTMultiLevelRoiAlign(mLayerName, mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum, mFeatmapStrides, mRoiScaleFactor, mFinestScale, mAligned); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs TRTMultiLevelRoiAlign::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + // warning, nbInputs should equal to mFeatmapStrides.size() + 1 + nvinfer1::DimsExprs ret; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[1].d[1]; + ret.d[2] = exprBuilder.constant(mAlignedHeight); + ret.d[3] = exprBuilder.constant(mAlignedWidth); + + return ret; + } + + bool TRTMultiLevelRoiAlign::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + } + + void TRTMultiLevelRoiAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT + { + // Validate input arguments + ASSERT(nbOutputs == 1); + ASSERT(nbInputs >= 1); + mFeatmapStrides = + std::vector(mFeatmapStrides.begin(), mFeatmapStrides.begin() + (nbInputs - 1)); + } + + size_t TRTMultiLevelRoiAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + return 0; + } + + int TRTMultiLevelRoiAlign::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int num_rois = inputDesc[0].dims.d[0]; + int batch_size = inputDesc[1].dims.d[0]; + int channels = inputDesc[1].dims.d[1]; + + const int kMaxFeatMap = 10; + int heights[kMaxFeatMap]; + int widths[kMaxFeatMap]; + float strides[kMaxFeatMap]; + + int num_feats = mFeatmapStrides.size(); + for (int i = 0; i < num_feats; ++i) + { + heights[i] = inputDesc[i + 1].dims.d[2]; + widths[i] = inputDesc[i + 1].dims.d[3]; + strides[i] = mFeatmapStrides[i]; + } + + const void* rois = inputs[0]; + const void* const* feats = inputs + 1; + + multi_level_roi_align((float*)outputs[0], (const float*)rois, num_rois, feats, num_feats, batch_size, channels, &heights[0], &widths[0], &strides[0], 
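// [Illustrative note added in review - not part of the patch] getOutputDimensions() above
// fixes the output shape from the rois tensor and the first feature map: d[0] = num_rois
// (inputs[0].d[0]), d[1] = channels (inputs[1].d[1]), and d[2] = d[3] = the configured
// pooled size. Plain-int version of the same rule:
#include <array>
constexpr std::array<int, 4> roi_align_out_shape(int num_rois, int channels, int out_h, int out_w)
{
    return {num_rois, channels, out_h, out_w};
}
// e.g. 100 rois over 256-channel feature maps with the default 7x7 pool -> [100, 256, 7, 7]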
mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum, mRoiScaleFactor, mFinestScale, mAligned, stream); + + return 0; + } + + nvinfer1::DataType TRTMultiLevelRoiAlign::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return nvinfer1::DataType::kFLOAT; + } + + // IPluginV2 Methods + const char* TRTMultiLevelRoiAlign::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTMultiLevelRoiAlign::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int TRTMultiLevelRoiAlign::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t TRTMultiLevelRoiAlign::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) + + serialized_size(mAlignedWidth) + serialized_size(mPoolMode) + serialized_size(mSampleNum) + + serialized_size(mRoiScaleFactor) + serialized_size(mFinestScale) + + serialized_size(mAligned); + } + + void TRTMultiLevelRoiAlign::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mAlignedHeight); + serialize_value(&buffer, mAlignedWidth); + serialize_value(&buffer, mPoolMode); + serialize_value(&buffer, mSampleNum); + serialize_value(&buffer, mRoiScaleFactor); + serialize_value(&buffer, mFinestScale); + serialize_value(&buffer, mAligned); + serialize_value(&buffer, mFeatmapStrides); + } + + TRTMultiLevelRoiAlignCreator::TRTMultiLevelRoiAlignCreator() + { + mPluginAttributes = std::vector( + {nvinfer1::PluginField("output_height"), nvinfer1::PluginField("output_width"), nvinfer1::PluginField("pool_mode"), nvinfer1::PluginField("sampling_ratio"), nvinfer1::PluginField("featmap_strides"), nvinfer1::PluginField("roi_scale_factor"), nvinfer1::PluginField("finest_scale"), nvinfer1::PluginField("aligned")}); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* TRTMultiLevelRoiAlignCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTMultiLevelRoiAlignCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* TRTMultiLevelRoiAlignCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + int alignedHeight = 7; + int alignedWidth = 7; + int poolMode = 0; + int sampleNum = 2; + std::vector featmapStrides; + float roiScaleFactor = -1; + int finestScale = 56; + bool aligned = false; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("output_height") == 0) + { + alignedHeight = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("output_width") == 0) + { + alignedWidth = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("pool_mode") == 0) + { + poolMode = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("sampling_ratio") == 0) + { + sampleNum = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("roi_scale_factor") == 0) + { + roiScaleFactor = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("finest_scale") == 0) + { + finestScale = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("featmap_strides") == 0) + { + int data_size = (fc->fields[i].length); + const float* data_start = static_cast(fc->fields[i].data); + featmapStrides = std::vector(data_start, data_start + data_size); + 
}
+            else if (field_name.compare("aligned") == 0)
+            {
+                int aligned_int = static_cast<const int*>(fc->fields[i].data)[0];
+                aligned = aligned_int != 0;
+            }
+        }
+
+        ASSERT(featmapStrides.size() != 0);
+
+        TRTMultiLevelRoiAlign* plugin =
+            new TRTMultiLevelRoiAlign(name, alignedHeight, alignedWidth, poolMode, sampleNum, featmapStrides, roiScaleFactor, finestScale, aligned);
+        plugin->setPluginNamespace(getPluginNamespace());
+        return plugin;
+    }
+
+    nvinfer1::IPluginV2* TRTMultiLevelRoiAlignCreator::deserializePlugin(
+        const char* name,
+        const void* serialData,
+        size_t serialLength) TRT_NOEXCEPT
+    {
+        auto plugin = new TRTMultiLevelRoiAlign(name, serialData, serialLength);
+        plugin->setPluginNamespace(getPluginNamespace());
+        return plugin;
+    }
+
+    REGISTER_TENSORRT_PLUGIN(TRTMultiLevelRoiAlignCreator);
 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp
index a9a06236e0..814118d29b 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp
@@ -10,69 +10,65 @@
 #include "trt_plugin_base.hpp"
-namespace mmdeploy {
-class TRTMultiLevelRoiAlign : public TRTPluginBase {
- public:
-  TRTMultiLevelRoiAlign(const std::string &name, int alignedHeight, int alignedWidth, int poolMode,
-                        int sampleNum, const std::vector<float> &featmapStrides,
-                        float roiScaleFactor = -1, int finestScale = 56, bool aligned = false);
+namespace mmdeploy
+{
+    class TRTMultiLevelRoiAlign : public TRTPluginBase
+    {
+      public:
+        TRTMultiLevelRoiAlign(const std::string& name, int alignedHeight, int alignedWidth, int poolMode, int sampleNum, const std::vector<float>& featmapStrides, float roiScaleFactor = -1, int finestScale = 56, bool aligned = false);
 
-  TRTMultiLevelRoiAlign(const std::string name, const void *data, size_t length);
+        TRTMultiLevelRoiAlign(const std::string name, const void* data, size_t length);
 
-  TRTMultiLevelRoiAlign() = delete;
+        TRTMultiLevelRoiAlign() = delete;
 
-  // IPluginV2DynamicExt Methods
-  nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
-  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
-                                          int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
-      TRT_NOEXCEPT override;
-  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
-                                 int nbOutputs) TRT_NOEXCEPT override;
-  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
-                       const nvinfer1::DynamicPluginTensorDesc *out,
-                       int nbOutputs) TRT_NOEXCEPT override;
-  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
-                          const nvinfer1::PluginTensorDesc *outputs,
-                          int nbOutputs) const TRT_NOEXCEPT override;
-  int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
-              const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
-              void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
+        // IPluginV2DynamicExt Methods
+        nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
+        nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
+            TRT_NOEXCEPT override;
+        bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override;
+        void 
configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; - private: - int mAlignedHeight; - int mAlignedWidth; - int mPoolMode; - int mSampleNum; - std::vector mFeatmapStrides; - float mRoiScaleFactor; - int mFinestScale; - bool mAligned; -}; + private: + int mAlignedHeight; + int mAlignedWidth; + int mPoolMode; + int mSampleNum; + std::vector mFeatmapStrides; + float mRoiScaleFactor; + int mFinestScale; + bool mAligned; + }; -class TRTMultiLevelRoiAlignCreator : public TRTPluginCreatorBase { - public: - TRTMultiLevelRoiAlignCreator(); + class TRTMultiLevelRoiAlignCreator : public TRTPluginCreatorBase + { + public: + TRTMultiLevelRoiAlignCreator(); - const char *getPluginName() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_ROI_ALIGN_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu index 9eefbe3f32..1663088e30 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu @@ -10,167 +10,234 @@ #include "trt_plugin_helper.hpp" const int kMAX_FEATMAP_SIZE = 10; -struct FeatData { - const void *data[kMAX_FEATMAP_SIZE]; - int batch_size; - int channels; - int h[kMAX_FEATMAP_SIZE]; - int 
w[kMAX_FEATMAP_SIZE]; - float spatial_scale[kMAX_FEATMAP_SIZE]; - int num_featmap; +struct FeatData +{ + const void* data[kMAX_FEATMAP_SIZE]; + int batch_size; + int channels; + int h[kMAX_FEATMAP_SIZE]; + int w[kMAX_FEATMAP_SIZE]; + float spatial_scale[kMAX_FEATMAP_SIZE]; + int num_featmap; }; -template -__device__ scalar_t roi_align_single(const scalar_t *__restrict__ bottom_data, - const int roi_batch_ind, const scalar_t roi_start_w, - const scalar_t roi_start_h, const scalar_t roi_end_w, - const scalar_t roi_end_h, const scalar_t spatial_scale, - const int pw, const int ph, const int c, const int sample_num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width) { - // Force malformed ROIs to be 1x1 - scalar_t roi_width = max(roi_end_w - roi_start_w, (scalar_t)(aligned ? 0. : 1.)); - scalar_t roi_height = max(roi_end_h - roi_start_h, (scalar_t)(aligned ? 0. : 1.)); - - const scalar_t bin_size_h = roi_height / pooled_height; - const scalar_t bin_size_w = roi_width / pooled_width; - - const scalar_t *offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - - const int sample_num_h = (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height); - const int sample_num_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width); - - scalar_t output_val = (pool_mode == 0) ? -FLT_MAX : 0; - const scalar_t y_offset = roi_start_h + ph * bin_size_h; - const scalar_t y_scale = bin_size_h / (scalar_t)(sample_num_h); - const scalar_t x_offset = roi_start_w + pw * bin_size_w; - const scalar_t x_scale = bin_size_w / (scalar_t)(sample_num_w); - for (int iy = 0; iy < sample_num_h; iy++) { - const scalar_t y = fma(scalar_t(iy) + scalar_t(.5f), y_scale, y_offset); - for (int ix = 0; ix < sample_num_w; ix++) { - const scalar_t x = fma(scalar_t(ix) + scalar_t(.5f), x_scale, x_offset); - scalar_t val = bilinear_interpolate(offset_bottom_data, height, width, y, x); - if (pool_mode == 0) { - output_val = max(output_val, val); - } else { - output_val += val; - } +template +__device__ scalar_t roi_align_single(const scalar_t* __restrict__ bottom_data, + const int roi_batch_ind, + const scalar_t roi_start_w, + const scalar_t roi_start_h, + const scalar_t roi_end_w, + const scalar_t roi_end_h, + const scalar_t spatial_scale, + const int pw, + const int ph, + const int c, + const int sample_num, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width) +{ + // Force malformed ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, (scalar_t)(aligned ? 0. : 1.)); + scalar_t roi_height = max(roi_end_h - roi_start_h, (scalar_t)(aligned ? 0. : 1.)); + + const scalar_t bin_size_h = roi_height / pooled_height; + const scalar_t bin_size_w = roi_width / pooled_width; + + const scalar_t* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + const int sample_num_h = (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height); + const int sample_num_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width); + + scalar_t output_val = (pool_mode == 0) ? 
-FLT_MAX : 0; + const scalar_t y_offset = roi_start_h + ph * bin_size_h; + const scalar_t y_scale = bin_size_h / (scalar_t)(sample_num_h); + const scalar_t x_offset = roi_start_w + pw * bin_size_w; + const scalar_t x_scale = bin_size_w / (scalar_t)(sample_num_w); + for (int iy = 0; iy < sample_num_h; iy++) + { + const scalar_t y = fma(scalar_t(iy) + scalar_t(.5f), y_scale, y_offset); + for (int ix = 0; ix < sample_num_w; ix++) + { + const scalar_t x = fma(scalar_t(ix) + scalar_t(.5f), x_scale, x_offset); + scalar_t val = bilinear_interpolate(offset_bottom_data, height, width, y, x); + if (pool_mode == 0) + { + output_val = max(output_val, val); + } + else + { + output_val += val; + } + } + } + if (pool_mode != 0) + { + output_val /= max(sample_num_h * sample_num_w, 1); } - } - if (pool_mode != 0) { - output_val /= max(sample_num_h * sample_num_w, 1); - } - return output_val; + return output_val; } -template -__global__ void roi_extractor_kernel(scalar_t *__restrict__ output, - const scalar_t *__restrict__ bottom_rois, FeatData feat_data, - const int pool_mode, const int sample_num, - const float roi_scale_factor, const int finest_scale, - const int pooled_height, const int pooled_width, - int nThreads) { - CUDA_1D_KERNEL_LOOP(index, nThreads) { - const int channels = feat_data.channels; - int tmp_index = index; - const int pw = tmp_index % pooled_width; - tmp_index /= pooled_width; - const int ph = tmp_index % pooled_height; - tmp_index /= pooled_height; - const int c = tmp_index % channels; - const int n = tmp_index / channels; - - const scalar_t *offset_bottom_rois = bottom_rois + n * 5; - - scalar_t roi_offset_x0 = offset_bottom_rois[1]; - scalar_t roi_offset_y0 = offset_bottom_rois[2]; - scalar_t roi_offset_x1 = offset_bottom_rois[3]; - scalar_t roi_offset_y1 = offset_bottom_rois[4]; - - const scalar_t scale = sqrtf((roi_offset_y1 - roi_offset_y0) * (roi_offset_x1 - roi_offset_x0)); - - const int target_lvls = - min(feat_data.num_featmap - 1, - max(0, int(floorf(log2f(scale / (scalar_t)(finest_scale) + 1e-6))))); - - if (roi_scale_factor > 0.) { - const scalar_t roi_off_cx = (roi_offset_x0 + roi_offset_x1) * 0.5; - const scalar_t roi_off_cy = (roi_offset_y0 + roi_offset_y1) * 0.5; - const scalar_t half_scale_factor = roi_scale_factor * 0.5; - const scalar_t half_roi_off_w = - fma(roi_offset_x1 - roi_offset_x0 + 1, half_scale_factor, scalar_t(-0.5)); - const scalar_t half_roi_off_h = - fma(roi_offset_y1 - roi_offset_y0 + 1, half_scale_factor, scalar_t(-0.5)); - - roi_offset_x0 = roi_off_cx - half_roi_off_w; - roi_offset_x1 = roi_off_cx + half_roi_off_w; - roi_offset_y0 = roi_off_cy - half_roi_off_h; - roi_offset_y1 = roi_off_cy + half_roi_off_h; - } - - const scalar_t spatial_scale = (scalar_t)feat_data.spatial_scale[target_lvls]; - const int height = feat_data.h[target_lvls]; - const int width = feat_data.w[target_lvls]; - const scalar_t *bottom_data = (scalar_t *)feat_data.data[target_lvls]; - - const int roi_batch_ind = offset_bottom_rois[0]; - const scalar_t offset = aligned ? 
(scalar_t)-0.5 : (scalar_t)0.0; - const scalar_t roi_start_w = - fma(roi_offset_x0, spatial_scale, offset); // roi_offset_x0 * spatial_scale + offset; - const scalar_t roi_start_h = - fma(roi_offset_y0, spatial_scale, offset); // roi_offset_y0 * spatial_scale + offset; - const scalar_t roi_end_w = - fma(roi_offset_x1, spatial_scale, offset); // (roi_offset_x1) * spatial_scale - offset; - const scalar_t roi_end_h = - fma(roi_offset_y1, spatial_scale, offset); // (roi_offset_y1)*spatial_scale - offset; - - if (pool_mode == 0) { - const scalar_t output_val = roi_align_single( - bottom_data, roi_batch_ind, roi_start_w, roi_start_h, roi_end_w, roi_end_h, spatial_scale, - pw, ph, c, sample_num, channels, height, width, pooled_height, pooled_width); - output[index] = output_val; - } else { - const scalar_t output_val = roi_align_single( - bottom_data, roi_batch_ind, roi_start_w, roi_start_h, roi_end_w, roi_end_h, spatial_scale, - pw, ph, c, sample_num, channels, height, width, pooled_height, pooled_width); - output[index] = output_val; +template +__global__ void roi_extractor_kernel(scalar_t* __restrict__ output, + const scalar_t* __restrict__ bottom_rois, + FeatData feat_data, + const int pool_mode, + const int sample_num, + const float roi_scale_factor, + const int finest_scale, + const int pooled_height, + const int pooled_width, + int nThreads) +{ + CUDA_1D_KERNEL_LOOP(index, nThreads) + { + const int channels = feat_data.channels; + int tmp_index = index; + const int pw = tmp_index % pooled_width; + tmp_index /= pooled_width; + const int ph = tmp_index % pooled_height; + tmp_index /= pooled_height; + const int c = tmp_index % channels; + const int n = tmp_index / channels; + + const scalar_t* offset_bottom_rois = bottom_rois + n * 5; + + scalar_t roi_offset_x0 = offset_bottom_rois[1]; + scalar_t roi_offset_y0 = offset_bottom_rois[2]; + scalar_t roi_offset_x1 = offset_bottom_rois[3]; + scalar_t roi_offset_y1 = offset_bottom_rois[4]; + + const scalar_t scale = sqrtf((roi_offset_y1 - roi_offset_y0) * (roi_offset_x1 - roi_offset_x0)); + + const int target_lvls = + min(feat_data.num_featmap - 1, + max(0, int(floorf(log2f(scale / (scalar_t)(finest_scale) + 1e-6))))); + + if (roi_scale_factor > 0.) + { + const scalar_t roi_off_cx = (roi_offset_x0 + roi_offset_x1) * 0.5; + const scalar_t roi_off_cy = (roi_offset_y0 + roi_offset_y1) * 0.5; + const scalar_t half_scale_factor = roi_scale_factor * 0.5; + const scalar_t half_roi_off_w = + fma(roi_offset_x1 - roi_offset_x0 + 1, half_scale_factor, scalar_t(-0.5)); + const scalar_t half_roi_off_h = + fma(roi_offset_y1 - roi_offset_y0 + 1, half_scale_factor, scalar_t(-0.5)); + + roi_offset_x0 = roi_off_cx - half_roi_off_w; + roi_offset_x1 = roi_off_cx + half_roi_off_w; + roi_offset_y0 = roi_off_cy - half_roi_off_h; + roi_offset_y1 = roi_off_cy + half_roi_off_h; + } + + const scalar_t spatial_scale = (scalar_t)feat_data.spatial_scale[target_lvls]; + const int height = feat_data.h[target_lvls]; + const int width = feat_data.w[target_lvls]; + const scalar_t* bottom_data = (scalar_t*)feat_data.data[target_lvls]; + + const int roi_batch_ind = offset_bottom_rois[0]; + const scalar_t offset = aligned ? 
(scalar_t)-0.5 : (scalar_t)0.0;
+        const scalar_t roi_start_w =
+            fma(roi_offset_x0, spatial_scale, offset);  // roi_offset_x0 * spatial_scale + offset;
+        const scalar_t roi_start_h =
+            fma(roi_offset_y0, spatial_scale, offset);  // roi_offset_y0 * spatial_scale + offset;
+        const scalar_t roi_end_w =
+            fma(roi_offset_x1, spatial_scale, offset);  // (roi_offset_x1) * spatial_scale - offset;
+        const scalar_t roi_end_h =
+            fma(roi_offset_y1, spatial_scale, offset);  // (roi_offset_y1)*spatial_scale - offset;
+
+        if (pool_mode == 0)
+        {
+            const scalar_t output_val = roi_align_single<scalar_t, 0, aligned>(
+                bottom_data,
+                roi_batch_ind,
+                roi_start_w,
+                roi_start_h,
+                roi_end_w,
+                roi_end_h,
+                spatial_scale,
+                pw,
+                ph,
+                c,
+                sample_num,
+                channels,
+                height,
+                width,
+                pooled_height,
+                pooled_width);
+            output[index] = output_val;
+        }
+        else
+        {
+            const scalar_t output_val = roi_align_single<scalar_t, 1, aligned>(
+                bottom_data,
+                roi_batch_ind,
+                roi_start_w,
+                roi_start_h,
+                roi_end_w,
+                roi_end_h,
+                spatial_scale,
+                pw,
+                ph,
+                c,
+                sample_num,
+                channels,
+                height,
+                width,
+                pooled_height,
+                pooled_width);
+            output[index] = output_val;
+        }
+    }
 }

-template <typename T>
-void multi_level_roi_align(T *output, const T *rois, int num_rois, const void *const *feats,
-                           int num_feats, int n, int c, int *h, int *w, float *strides,
-                           int aligned_height, int aligned_width, int pool_mode, int sample_num,
-                           float roi_scale_factor, int finest_scale, bool aligned,
-                           cudaStream_t stream) {
-  FeatData feat_data;
-  feat_data.batch_size = n;
-  feat_data.channels = c;
-  feat_data.num_featmap = num_feats;
-  for (int i = 0; i < num_feats; ++i) {
-    feat_data.data[i] = feats[i];
-    feat_data.h[i] = h[i];
-    feat_data.w[i] = w[i];
-    feat_data.spatial_scale[i] = 1. / float(strides[i]);
-  }
-  int nThreads = num_rois * c * aligned_height * aligned_width;
-  if (aligned) {
-    roi_extractor_kernel<T, true><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
-        output, rois, feat_data, pool_mode, sample_num, roi_scale_factor, finest_scale,
-        aligned_height, aligned_width, nThreads);
-  } else {
-    roi_extractor_kernel<T, false><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
-        output, rois, feat_data, pool_mode, sample_num, roi_scale_factor, finest_scale,
-        aligned_height, aligned_width, nThreads);
-  }
+template <typename T>
+void multi_level_roi_align(T* output, const T* rois, int num_rois, const void* const* feats, int num_feats, int n, int c, int* h, int* w, float* strides, int aligned_height, int aligned_width, int pool_mode, int sample_num, float roi_scale_factor, int finest_scale, bool aligned, cudaStream_t stream)
+{
+    FeatData feat_data;
+    feat_data.batch_size = n;
+    feat_data.channels = c;
+    feat_data.num_featmap = num_feats;
+    for (int i = 0; i < num_feats; ++i)
+    {
+        feat_data.data[i] = feats[i];
+        feat_data.h[i] = h[i];
+        feat_data.w[i] = w[i];
+        feat_data.spatial_scale[i] = 1. / float(strides[i]);
+    }
+    int nThreads = num_rois * c * aligned_height * aligned_width;
+    if (aligned)
+    {
+        roi_extractor_kernel<T, true><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
+            output,
+            rois,
+            feat_data,
+            pool_mode,
+            sample_num,
+            roi_scale_factor,
+            finest_scale,
+            aligned_height,
+            aligned_width,
+            nThreads);
+    }
+    else
+    {
+        roi_extractor_kernel<T, false><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
+            output,
+            rois,
+            feat_data,
+            pool_mode,
+            sample_num,
+            roi_scale_factor,
+            finest_scale,
+            aligned_height,
+            aligned_width,
+            nThreads);
+    }
 }

-template void multi_level_roi_align<float>(float *output, const float *rois, int num_rois,
-                                           const void *const *feats, int num_feats, int n, int c,
-                                           int *h, int *w, float *strides, int aligned_height,
-                                           int aligned_width, int pool_mode, int sample_num,
-                                           float roi_scale_factor, int finest_scale, bool aligned,
-                                           cudaStream_t stream);
+template void multi_level_roi_align<float>(float* output, const float* rois, int num_rois, const void* const* feats, int num_feats, int n, int c, int* h, int* w, float* strides, int aligned_height, int aligned_width, int pool_mode, int sample_num, float roi_scale_factor, int finest_scale, bool aligned, cudaStream_t stream);
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.hpp
index 5f7220dbf0..efd5564a27 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.hpp
@@ -3,11 +3,7 @@
 #define TRT_MULTI_LEVEL_ROI_ALIGN_KERNEL_HPP
 #include <cuda_runtime.h>
-template <typename T>
-void multi_level_roi_align(T *output, const T *rois, int num_rois, const void *const *feats,
-                           int num_feats, int n, int c, int *h, int *w, float *strides,
-                           int aligned_height, int aligned_width, int pool_mode, int sample_num,
-                           float roi_scale_factor, int finest_scale, bool aligned,
-                           cudaStream_t stream);
+template <typename T>
+void multi_level_roi_align(T* output, const T* rois, int num_rois, const void* const* feats, int num_feats, int n, int c, int* h, int* w, float* strides, int aligned_height, int aligned_width, int pool_mode, int sample_num, float roi_scale_factor, int finest_scale, bool aligned, cudaStream_t stream);
 #endif  // TRT_MULTI_LEVEL_ROI_ALIGN_KERNEL_HPP
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.cpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.cpp
index 6637603128..492a171efd 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.cpp
@@ -9,220 +9,282 @@
 #include "trt_multi_level_rotated_roi_align_kernel.hpp"
 #include "trt_plugin_helper.hpp"
 #include "trt_serialize.hpp"
-namespace mmdeploy {
-namespace {
-static const char *PLUGIN_VERSION{"1"};
-static const char *PLUGIN_NAME{"MMCVMultiLevelRotatedRoiAlign"};
-}  // namespace
-
-TRTMultiLevelRotatedRoiAlign::TRTMultiLevelRotatedRoiAlign(
-    const std::string &name, int alignedHeight, int alignedWidth, int clockwise, int sampleNum,
-    const std::vector<float> &featmapStrides, float roiScaleFactor, int finestScale, bool aligned)
-    : TRTPluginBase(name),
-      mAlignedHeight(alignedHeight),
-      mAlignedWidth(alignedWidth),
-      mClockwise(clockwise),
-      mSampleNum(sampleNum),
-      mFeatmapStrides(featmapStrides),
-      mRoiScaleFactor(roiScaleFactor),
-
mFinestScale(finestScale), - mAligned(aligned) {} - -TRTMultiLevelRotatedRoiAlign::TRTMultiLevelRotatedRoiAlign(const std::string name, const void *data, - size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &mAlignedHeight); - deserialize_value(&data, &length, &mAlignedWidth); - deserialize_value(&data, &length, &mClockwise); - deserialize_value(&data, &length, &mSampleNum); - deserialize_value(&data, &length, &mRoiScaleFactor); - deserialize_value(&data, &length, &mFinestScale); - deserialize_value(&data, &length, &mAligned); - deserialize_value(&data, &length, &mFeatmapStrides); -} - -nvinfer1::IPluginV2DynamicExt *TRTMultiLevelRotatedRoiAlign::clone() const TRT_NOEXCEPT { - TRTMultiLevelRotatedRoiAlign *plugin = new TRTMultiLevelRotatedRoiAlign( - mLayerName, mAlignedHeight, mAlignedWidth, mClockwise, mSampleNum, mFeatmapStrides, - mRoiScaleFactor, mFinestScale, mAligned); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs TRTMultiLevelRotatedRoiAlign::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - // warning, nbInputs should equal to mFeatmapStrides.size() + 1 - nvinfer1::DimsExprs ret; - ret.nbDims = 4; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[1].d[1]; - ret.d[2] = exprBuilder.constant(mAlignedHeight); - ret.d[3] = exprBuilder.constant(mAlignedWidth); - - return ret; -} - -bool TRTMultiLevelRotatedRoiAlign::supportsFormatCombination( - int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT { - return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; -} - -void TRTMultiLevelRotatedRoiAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT { - // Validate input arguments - ASSERT(nbOutputs == 1); - ASSERT(nbInputs >= 1); - mFeatmapStrides = - std::vector(mFeatmapStrides.begin(), mFeatmapStrides.begin() + nbInputs - 1); -} - -size_t TRTMultiLevelRotatedRoiAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -int TRTMultiLevelRotatedRoiAlign::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, void *const *outputs, - void *workSpace, cudaStream_t stream) TRT_NOEXCEPT { - int num_rois = inputDesc[0].dims.d[0]; - int batch_size = inputDesc[1].dims.d[0]; - int channels = inputDesc[1].dims.d[1]; - - const int kMaxFeatMap = 10; - int heights[kMaxFeatMap]; - int widths[kMaxFeatMap]; - float strides[kMaxFeatMap]; - - int num_feats = mFeatmapStrides.size(); - for (int i = 0; i < num_feats; ++i) { - heights[i] = inputDesc[i + 1].dims.d[2]; - widths[i] = inputDesc[i + 1].dims.d[3]; - strides[i] = mFeatmapStrides[i]; - } - - const void *rois = inputs[0]; - const void *const *feats = inputs + 1; - - multi_level_rotated_roi_align((float *)outputs[0], (const float *)rois, num_rois, feats, - num_feats, batch_size, channels, &heights[0], &widths[0], - &strides[0], mAlignedHeight, mAlignedWidth, mClockwise, - mSampleNum, mRoiScaleFactor, mFinestScale, mAligned, stream); - - return 0; -} - -nvinfer1::DataType TRTMultiLevelRotatedRoiAlign::getOutputDataType( - int index, const nvinfer1::DataType *inputTypes, int nbInputs) 
const TRT_NOEXCEPT { - return nvinfer1::DataType::kFLOAT; -} - -// IPluginV2 Methods -const char *TRTMultiLevelRotatedRoiAlign::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTMultiLevelRotatedRoiAlign::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -int TRTMultiLevelRotatedRoiAlign::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t TRTMultiLevelRotatedRoiAlign::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) + - serialized_size(mAlignedWidth) + serialized_size(mClockwise) + - serialized_size(mSampleNum) + serialized_size(mRoiScaleFactor) + - serialized_size(mFinestScale) + serialized_size(mAligned); -} - -void TRTMultiLevelRotatedRoiAlign::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mAlignedHeight); - serialize_value(&buffer, mAlignedWidth); - serialize_value(&buffer, mClockwise); - serialize_value(&buffer, mSampleNum); - serialize_value(&buffer, mRoiScaleFactor); - serialize_value(&buffer, mFinestScale); - serialize_value(&buffer, mAligned); - serialize_value(&buffer, mFeatmapStrides); -} - -TRTMultiLevelRotatedRoiAlignCreator::TRTMultiLevelRotatedRoiAlignCreator() { - mPluginAttributes = std::vector( - {nvinfer1::PluginField("output_height"), nvinfer1::PluginField("output_width"), - nvinfer1::PluginField("clockwise"), nvinfer1::PluginField("sampling_ratio"), - nvinfer1::PluginField("featmap_strides"), nvinfer1::PluginField("roi_scale_factor"), - nvinfer1::PluginField("finest_scale"), nvinfer1::PluginField("aligned")}); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *TRTMultiLevelRotatedRoiAlignCreator::getPluginName() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char *TRTMultiLevelRotatedRoiAlignCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -nvinfer1::IPluginV2 *TRTMultiLevelRotatedRoiAlignCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - int alignedHeight = 7; - int alignedWidth = 7; - int clockwise = 0; - int sampleNum = 2; - std::vector featmapStrides; - float roiScaleFactor = -1; - int finestScale = 56; - bool aligned = false; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; - } - std::string field_name(fc->fields[i].name); - - if (field_name.compare("output_height") == 0) { - alignedHeight = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("output_width") == 0) { - alignedWidth = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("clockwise") == 0) { - clockwise = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("sampling_ratio") == 0) { - sampleNum = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("roi_scale_factor") == 0) { - roiScaleFactor = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("finest_scale") == 0) { - finestScale = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("featmap_strides") == 0) { - int data_size = (fc->fields[i].length); - const float *data_start = static_cast(fc->fields[i].data); - featmapStrides = std::vector(data_start, data_start + data_size); - } else if (field_name.compare("aligned") == 0) { - int aligned_int = static_cast(fc->fields[i].data)[0]; - aligned = aligned_int != 0; - } - } - - ASSERT(featmapStrides.size() != 0); - - TRTMultiLevelRotatedRoiAlign *plugin = - new 
TRTMultiLevelRotatedRoiAlign(name, alignedHeight, alignedWidth, clockwise, sampleNum, - featmapStrides, roiScaleFactor, finestScale, aligned); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *TRTMultiLevelRotatedRoiAlignCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT { - auto plugin = new TRTMultiLevelRotatedRoiAlign(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -REGISTER_TENSORRT_PLUGIN(TRTMultiLevelRotatedRoiAlignCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"MMCVMultiLevelRotatedRoiAlign"}; + } // namespace + + TRTMultiLevelRotatedRoiAlign::TRTMultiLevelRotatedRoiAlign( + const std::string& name, + int alignedHeight, + int alignedWidth, + int clockwise, + int sampleNum, + const std::vector& featmapStrides, + float roiScaleFactor, + int finestScale, + bool aligned) + : TRTPluginBase(name) + , mAlignedHeight(alignedHeight) + , mAlignedWidth(alignedWidth) + , mClockwise(clockwise) + , mSampleNum(sampleNum) + , mFeatmapStrides(featmapStrides) + , mRoiScaleFactor(roiScaleFactor) + , mFinestScale(finestScale) + , mAligned(aligned) + { + } + + TRTMultiLevelRotatedRoiAlign::TRTMultiLevelRotatedRoiAlign(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mAlignedHeight); + deserialize_value(&data, &length, &mAlignedWidth); + deserialize_value(&data, &length, &mClockwise); + deserialize_value(&data, &length, &mSampleNum); + deserialize_value(&data, &length, &mRoiScaleFactor); + deserialize_value(&data, &length, &mFinestScale); + deserialize_value(&data, &length, &mAligned); + deserialize_value(&data, &length, &mFeatmapStrides); + } + + nvinfer1::IPluginV2DynamicExt* TRTMultiLevelRotatedRoiAlign::clone() const TRT_NOEXCEPT + { + TRTMultiLevelRotatedRoiAlign* plugin = new TRTMultiLevelRotatedRoiAlign( + mLayerName, + mAlignedHeight, + mAlignedWidth, + mClockwise, + mSampleNum, + mFeatmapStrides, + mRoiScaleFactor, + mFinestScale, + mAligned); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs TRTMultiLevelRotatedRoiAlign::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + // warning, nbInputs should equal to mFeatmapStrides.size() + 1 + nvinfer1::DimsExprs ret; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[1].d[1]; + ret.d[2] = exprBuilder.constant(mAlignedHeight); + ret.d[3] = exprBuilder.constant(mAlignedWidth); + + return ret; + } + + bool TRTMultiLevelRotatedRoiAlign::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + } + + void TRTMultiLevelRotatedRoiAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT + { + // Validate input arguments + ASSERT(nbOutputs == 1); + ASSERT(nbInputs >= 1); + mFeatmapStrides = + std::vector(mFeatmapStrides.begin(), mFeatmapStrides.begin() + nbInputs - 1); + } + + size_t TRTMultiLevelRotatedRoiAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const 
nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + return 0; + } + + int TRTMultiLevelRotatedRoiAlign::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int num_rois = inputDesc[0].dims.d[0]; + int batch_size = inputDesc[1].dims.d[0]; + int channels = inputDesc[1].dims.d[1]; + + const int kMaxFeatMap = 10; + int heights[kMaxFeatMap]; + int widths[kMaxFeatMap]; + float strides[kMaxFeatMap]; + + int num_feats = mFeatmapStrides.size(); + for (int i = 0; i < num_feats; ++i) + { + heights[i] = inputDesc[i + 1].dims.d[2]; + widths[i] = inputDesc[i + 1].dims.d[3]; + strides[i] = mFeatmapStrides[i]; + } + + const void* rois = inputs[0]; + const void* const* feats = inputs + 1; + + multi_level_rotated_roi_align((float*)outputs[0], (const float*)rois, num_rois, feats, num_feats, batch_size, channels, &heights[0], &widths[0], &strides[0], mAlignedHeight, mAlignedWidth, mClockwise, mSampleNum, mRoiScaleFactor, mFinestScale, mAligned, stream); + + return 0; + } + + nvinfer1::DataType TRTMultiLevelRotatedRoiAlign::getOutputDataType( + int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return nvinfer1::DataType::kFLOAT; + } + + // IPluginV2 Methods + const char* TRTMultiLevelRotatedRoiAlign::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTMultiLevelRotatedRoiAlign::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int TRTMultiLevelRotatedRoiAlign::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t TRTMultiLevelRotatedRoiAlign::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) + + serialized_size(mAlignedWidth) + serialized_size(mClockwise) + + serialized_size(mSampleNum) + serialized_size(mRoiScaleFactor) + + serialized_size(mFinestScale) + serialized_size(mAligned); + } + + void TRTMultiLevelRotatedRoiAlign::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mAlignedHeight); + serialize_value(&buffer, mAlignedWidth); + serialize_value(&buffer, mClockwise); + serialize_value(&buffer, mSampleNum); + serialize_value(&buffer, mRoiScaleFactor); + serialize_value(&buffer, mFinestScale); + serialize_value(&buffer, mAligned); + serialize_value(&buffer, mFeatmapStrides); + } + + TRTMultiLevelRotatedRoiAlignCreator::TRTMultiLevelRotatedRoiAlignCreator() + { + mPluginAttributes = std::vector( + {nvinfer1::PluginField("output_height"), nvinfer1::PluginField("output_width"), nvinfer1::PluginField("clockwise"), nvinfer1::PluginField("sampling_ratio"), nvinfer1::PluginField("featmap_strides"), nvinfer1::PluginField("roi_scale_factor"), nvinfer1::PluginField("finest_scale"), nvinfer1::PluginField("aligned")}); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* TRTMultiLevelRotatedRoiAlignCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTMultiLevelRotatedRoiAlignCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* TRTMultiLevelRotatedRoiAlignCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + int alignedHeight = 7; + int alignedWidth = 7; + int clockwise = 0; + int sampleNum = 2; + std::vector featmapStrides; + float 
roiScaleFactor = -1; + int finestScale = 56; + bool aligned = false; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("output_height") == 0) + { + alignedHeight = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("output_width") == 0) + { + alignedWidth = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("clockwise") == 0) + { + clockwise = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("sampling_ratio") == 0) + { + sampleNum = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("roi_scale_factor") == 0) + { + roiScaleFactor = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("finest_scale") == 0) + { + finestScale = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("featmap_strides") == 0) + { + int data_size = (fc->fields[i].length); + const float* data_start = static_cast(fc->fields[i].data); + featmapStrides = std::vector(data_start, data_start + data_size); + } + else if (field_name.compare("aligned") == 0) + { + int aligned_int = static_cast(fc->fields[i].data)[0]; + aligned = aligned_int != 0; + } + } + + ASSERT(featmapStrides.size() != 0); + + TRTMultiLevelRotatedRoiAlign* plugin = + new TRTMultiLevelRotatedRoiAlign(name, alignedHeight, alignedWidth, clockwise, sampleNum, featmapStrides, roiScaleFactor, finestScale, aligned); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* TRTMultiLevelRotatedRoiAlignCreator::deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new TRTMultiLevelRotatedRoiAlign(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + REGISTER_TENSORRT_PLUGIN(TRTMultiLevelRotatedRoiAlignCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.hpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.hpp index cf0bab7584..570317ebde 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.hpp @@ -10,70 +10,65 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class TRTMultiLevelRotatedRoiAlign : public TRTPluginBase { - public: - TRTMultiLevelRotatedRoiAlign(const std::string &name, int alignedHeight, int alignedWidth, - int clockwise, int sampleNum, - const std::vector &featmapStrides, float roiScaleFactor = -1, - int finestScale = 56, bool aligned = false); +namespace mmdeploy +{ + class TRTMultiLevelRotatedRoiAlign : public TRTPluginBase + { + public: + TRTMultiLevelRotatedRoiAlign(const std::string& name, int alignedHeight, int alignedWidth, int clockwise, int sampleNum, const std::vector& featmapStrides, float roiScaleFactor = -1, int finestScale = 56, bool aligned = false); - TRTMultiLevelRotatedRoiAlign(const std::string name, const void *data, size_t length); + TRTMultiLevelRotatedRoiAlign(const std::string name, const void* data, size_t length); - TRTMultiLevelRotatedRoiAlign() = delete; + TRTMultiLevelRotatedRoiAlign() = delete; - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - 
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) + TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; - private: - int mAlignedHeight; - int mAlignedWidth; - int mClockwise; - int mSampleNum; - std::vector mFeatmapStrides; - float mRoiScaleFactor; - int mFinestScale; - bool mAligned; -}; + private: + int mAlignedHeight; + int mAlignedWidth; + int mClockwise; + int mSampleNum; + std::vector mFeatmapStrides; + float mRoiScaleFactor; + int mFinestScale; + bool mAligned; + }; -class TRTMultiLevelRotatedRoiAlignCreator : public TRTPluginCreatorBase { - public: - TRTMultiLevelRotatedRoiAlignCreator(); + class TRTMultiLevelRotatedRoiAlignCreator : public TRTPluginCreatorBase + { + public: + TRTMultiLevelRotatedRoiAlignCreator(); - const char *getPluginName() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const char 
*getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_MULTI_LEVEL_ROTATED_ROI_ALIGN_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.cu index 1c6f292bae..897ae69e8b 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.cu @@ -10,155 +10,223 @@ #include "trt_plugin_helper.hpp" const int kMAX_FEATMAP_SIZE = 10; -struct FeatData { - const void *data[kMAX_FEATMAP_SIZE]; - int batch_size; - int channels; - int h[kMAX_FEATMAP_SIZE]; - int w[kMAX_FEATMAP_SIZE]; - float spatial_scale[kMAX_FEATMAP_SIZE]; - int num_featmap; +struct FeatData +{ + const void* data[kMAX_FEATMAP_SIZE]; + int batch_size; + int channels; + int h[kMAX_FEATMAP_SIZE]; + int w[kMAX_FEATMAP_SIZE]; + float spatial_scale[kMAX_FEATMAP_SIZE]; + int num_featmap; }; -template -__device__ scalar_t roi_align_single(const scalar_t *__restrict__ bottom_data, - const int roi_batch_ind, scalar_t roi_center_w, - scalar_t roi_center_h, scalar_t roi_width, scalar_t roi_height, - scalar_t theta, const scalar_t spatial_scale, const int pw, - const int ph, const int c, const int sample_num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width) { - // Force malformed ROIs to be 1x1 - - roi_width = max(roi_width, (scalar_t)1.); - roi_height = max(roi_height, (scalar_t)1.); - - const scalar_t bin_size_h = roi_height / scalar_t(pooled_height); - const scalar_t bin_size_w = roi_width / scalar_t(pooled_width); - - const scalar_t *offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - - const int roi_bin_grid_h = (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height); - const int roi_bin_grid_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width); - - const scalar_t roi_start_h = -roi_height / scalar_t(2.0); - const scalar_t roi_start_w = -roi_width / scalar_t(2.0); - const scalar_t cosscalar_theta = cos(theta); - const scalar_t sinscalar_theta = sin(theta); - - // We do average (integral) pooling inside a bin - const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 - - scalar_t output_val = 0.; - - for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 - const scalar_t yy = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const scalar_t xx = - roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); - - // Rotate by theta (counterclockwise) around the center and translate - scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; - scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; - - scalar_t val = bilinear_interpolate(offset_bottom_data, height, width, y, x); - output_val += val; +template +__device__ scalar_t roi_align_single(const scalar_t* __restrict__ bottom_data, + const int roi_batch_ind, + scalar_t roi_center_w, + scalar_t roi_center_h, + scalar_t roi_width, + scalar_t roi_height, + scalar_t theta, + const scalar_t spatial_scale, + const int pw, + const int ph, + const int c, + const int sample_num, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width) +{ + // Force malformed ROIs to be 1x1 + + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + + const scalar_t bin_size_h = roi_height / scalar_t(pooled_height); + const scalar_t bin_size_w = roi_width / scalar_t(pooled_width); + + const scalar_t* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + const int roi_bin_grid_h = (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height); + const int roi_bin_grid_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width); + + const scalar_t roi_start_h = -roi_height / scalar_t(2.0); + const scalar_t roi_start_w = -roi_width / scalar_t(2.0); + const scalar_t cosscalar_theta = cos(theta); + const scalar_t sinscalar_theta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + scalar_t output_val = 0.; + + for (int iy = 0; iy < roi_bin_grid_h; iy++) + { // e.g., iy = 0, 1 + const scalar_t yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) + { + const scalar_t xx = + roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + // Rotate by theta (counterclockwise) around the center and translate + scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; + scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; + + scalar_t val = bilinear_interpolate(offset_bottom_data, height, width, y, x); + output_val += val; + } } - } - return output_val / count; + return output_val / count; } -template -__global__ void rotated_roi_extractor_kernel(scalar_t *__restrict__ output, - const scalar_t *__restrict__ bottom_rois, - FeatData feat_data, const int clockwise, - const int sample_num, const float roi_scale_factor, - const int finest_scale, const int pooled_height, - const int pooled_width, int nThreads) { - CUDA_1D_KERNEL_LOOP(index, nThreads) { - const int channels = feat_data.channels; - int tmp_index = index; - const int pw = tmp_index % pooled_width; - tmp_index /= pooled_width; - const int ph = tmp_index % pooled_height; - tmp_index /= pooled_height; - const int c = tmp_index % channels; - const int n = tmp_index / channels; - - const scalar_t *offset_bottom_rois = bottom_rois + n * 6; - - scalar_t roi_offset_x0 = offset_bottom_rois[1]; - scalar_t roi_offset_y0 = offset_bottom_rois[2]; - scalar_t roi_offset_width = offset_bottom_rois[3]; - scalar_t roi_offset_height = offset_bottom_rois[4]; - scalar_t theta = offset_bottom_rois[5]; - - const scalar_t scale = sqrtf(roi_offset_width * roi_offset_height); - - const int target_lvls = - min(feat_data.num_featmap - 1, - max(0, int(floorf(log2f(scale / (scalar_t)(finest_scale) + 1e-6))))); - - if (roi_scale_factor > 0.) { - roi_offset_width = roi_offset_width * roi_scale_factor; - roi_offset_height = roi_offset_height * roi_scale_factor; +template +__global__ void rotated_roi_extractor_kernel(scalar_t* __restrict__ output, + const scalar_t* __restrict__ bottom_rois, + FeatData feat_data, + const int clockwise, + const int sample_num, + const float roi_scale_factor, + const int finest_scale, + const int pooled_height, + const int pooled_width, + int nThreads) +{ + CUDA_1D_KERNEL_LOOP(index, nThreads) + { + const int channels = feat_data.channels; + int tmp_index = index; + const int pw = tmp_index % pooled_width; + tmp_index /= pooled_width; + const int ph = tmp_index % pooled_height; + tmp_index /= pooled_height; + const int c = tmp_index % channels; + const int n = tmp_index / channels; + + const scalar_t* offset_bottom_rois = bottom_rois + n * 6; + + scalar_t roi_offset_x0 = offset_bottom_rois[1]; + scalar_t roi_offset_y0 = offset_bottom_rois[2]; + scalar_t roi_offset_width = offset_bottom_rois[3]; + scalar_t roi_offset_height = offset_bottom_rois[4]; + scalar_t theta = offset_bottom_rois[5]; + + const scalar_t scale = sqrtf(roi_offset_width * roi_offset_height); + + const int target_lvls = + min(feat_data.num_featmap - 1, + max(0, int(floorf(log2f(scale / (scalar_t)(finest_scale) + 1e-6))))); + + if (roi_scale_factor > 0.) 
+        {
+            roi_offset_width = roi_offset_width * roi_scale_factor;
+            roi_offset_height = roi_offset_height * roi_scale_factor;
+        }
+
+        const scalar_t spatial_scale = (scalar_t)feat_data.spatial_scale[target_lvls];
+        const int height = feat_data.h[target_lvls];
+        const int width = feat_data.w[target_lvls];
+        const scalar_t* bottom_data = (scalar_t*)feat_data.data[target_lvls];
+
+        const int roi_batch_ind = offset_bottom_rois[0];
+        const scalar_t offset = aligned ? (scalar_t)-0.5 : (scalar_t)0.0;
+        const scalar_t roi_center_w = fma(roi_offset_x0, spatial_scale, offset);
+        const scalar_t roi_center_h = fma(roi_offset_y0, spatial_scale, offset);
+        const scalar_t roi_width = roi_offset_width * spatial_scale;
+        const scalar_t roi_height = roi_offset_height * spatial_scale;
+
+        theta = clockwise > 0 ? -theta : theta;
+
+        const scalar_t output_val = roi_align_single<scalar_t>(
+            bottom_data,
+            roi_batch_ind,
+            roi_center_w,
+            roi_center_h,
+            roi_width,
+            roi_height,
+            theta,
+            spatial_scale,
+            pw,
+            ph,
+            c,
+            sample_num,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width);
+        output[index] = output_val;
 }
-
-    const scalar_t spatial_scale = (scalar_t)feat_data.spatial_scale[target_lvls];
-    const int height = feat_data.h[target_lvls];
-    const int width = feat_data.w[target_lvls];
-    const scalar_t *bottom_data = (scalar_t *)feat_data.data[target_lvls];
-
-    const int roi_batch_ind = offset_bottom_rois[0];
-    const scalar_t offset = aligned ? (scalar_t)-0.5 : (scalar_t)0.0;
-    const scalar_t roi_center_w = fma(roi_offset_x0, spatial_scale, offset);
-    const scalar_t roi_center_h = fma(roi_offset_y0, spatial_scale, offset);
-    const scalar_t roi_width = roi_offset_width * spatial_scale;
-    const scalar_t roi_height = roi_offset_height * spatial_scale;
-
-    theta = clockwise > 0 ? -theta : theta;
-
-    const scalar_t output_val = roi_align_single<scalar_t>(
-        bottom_data, roi_batch_ind, roi_center_w, roi_center_h, roi_width, roi_height, theta,
-        spatial_scale, pw, ph, c, sample_num, channels, height, width, pooled_height, pooled_width);
-    output[index] = output_val;
-  }
 }
-template <typename T>
-void multi_level_rotated_roi_align(T *output, const T *rois, int num_rois, const void *const *feats,
-                                   int num_feats, int n, int c, int *h, int *w, float *strides,
-                                   int aligned_height, int aligned_width, int clockwise,
-                                   int sample_num, float roi_scale_factor, int finest_scale,
-                                   bool aligned, cudaStream_t stream) {
-  FeatData feat_data;
-  feat_data.batch_size = n;
-  feat_data.channels = c;
-  feat_data.num_featmap = num_feats;
-  for (int i = 0; i < num_feats; ++i) {
-    feat_data.data[i] = feats[i];
-    feat_data.h[i] = h[i];
-    feat_data.w[i] = w[i];
-    feat_data.spatial_scale[i] = 1. / float(strides[i]);
-  }
-  int nThreads = num_rois * c * aligned_height * aligned_width;
-  if (aligned) {
-    rotated_roi_extractor_kernel<T, true><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
-        output, rois, feat_data, clockwise, sample_num, roi_scale_factor, finest_scale,
-        aligned_height, aligned_width, nThreads);
-  } else {
-    rotated_roi_extractor_kernel<T, false><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
-        output, rois, feat_data, clockwise, sample_num, roi_scale_factor, finest_scale,
-        aligned_height, aligned_width, nThreads);
-  }
+template <typename T>
+void multi_level_rotated_roi_align(T* output, const T* rois, int num_rois, const void* const* feats, int num_feats, int n, int c, int* h, int* w, float* strides, int aligned_height, int aligned_width, int clockwise, int sample_num, float roi_scale_factor, int finest_scale, bool aligned, cudaStream_t stream)
+{
+    FeatData feat_data;
+    feat_data.batch_size = n;
+    feat_data.channels = c;
+    feat_data.num_featmap = num_feats;
+    for (int i = 0; i < num_feats; ++i)
+    {
+        feat_data.data[i] = feats[i];
+        feat_data.h[i] = h[i];
+        feat_data.w[i] = w[i];
+        feat_data.spatial_scale[i] = 1. / float(strides[i]);
+    }
+    int nThreads = num_rois * c * aligned_height * aligned_width;
+    if (aligned)
+    {
+        rotated_roi_extractor_kernel<T, true><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
+            output,
+            rois,
+            feat_data,
+            clockwise,
+            sample_num,
+            roi_scale_factor,
+            finest_scale,
+            aligned_height,
+            aligned_width,
+            nThreads);
+    }
+    else
+    {
+        rotated_roi_extractor_kernel<T, false><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
+            output,
+            rois,
+            feat_data,
+            clockwise,
+            sample_num,
+            roi_scale_factor,
+            finest_scale,
+            aligned_height,
+            aligned_width,
+            nThreads);
+    }
 }
 template void multi_level_rotated_roi_align<float>(
-    float *output, const float *rois, int num_rois, const void *const *feats, int num_feats, int n,
-    int c, int *h, int *w, float *strides, int aligned_height, int aligned_width, int clockwise,
-    int sample_num, float roi_scale_factor, int finest_scale, bool aligned, cudaStream_t stream);
+    float* output,
+    const float* rois,
+    int num_rois,
+    const void* const* feats,
+    int num_feats,
+    int n,
+    int c,
+    int* h,
+    int* w,
+    float* strides,
+    int aligned_height,
+    int aligned_width,
+    int clockwise,
+    int sample_num,
+    float roi_scale_factor,
+    int finest_scale,
+    bool aligned,
+    cudaStream_t stream);
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.hpp
index fc3700df3b..f3fb25df83 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.hpp
@@ -3,11 +3,7 @@
 #define TRT_MULTI_LEVEL_ROTATED_ROI_ALIGN_KERNEL_HPP
 #include <cuda_runtime.h>
-template <typename T>
-void multi_level_rotated_roi_align(T *output, const T *rois, int num_rois, const void *const *feats,
-                                   int num_feats, int n, int c, int *h, int *w, float *strides,
-                                   int aligned_height, int aligned_width, int clockwise,
-                                   int sample_num, float roi_scale_factor, int finest_scale,
-                                   bool aligned, cudaStream_t stream);
+template <typename T>
+void multi_level_rotated_roi_align(T* output, const T* rois, int num_rois, const void* const* feats, int num_feats, int n, int c, int* h, int* w, float* strides, int aligned_height, int aligned_width, int clockwise, int sample_num, float roi_scale_factor, int finest_scale, bool aligned, cudaStream_t stream);
 #endif  // TRT_MULTI_LEVEL_ROTATED_ROI_ALIGN_KERNEL_HPP
diff --git
a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.cpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.cpp
index d14a25e929..ce9e81290d 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.cpp
@@ -10,164 +10,208 @@
 using namespace nvinfer1;

-namespace mmdeploy {
-namespace {
-static const char *PLUGIN_VERSION{"1"};
-static const char *PLUGIN_NAME{"MMCVMultiScaleDeformableAttention"};
-}  // namespace
-
-MultiScaleDeformableAttnPluginDynamic::MultiScaleDeformableAttnPluginDynamic(
-    const std::string &name)
-    : TRTPluginBase(name) {}
-
-MultiScaleDeformableAttnPluginDynamic::MultiScaleDeformableAttnPluginDynamic(const std::string name,
-                                                                             const void *data,
-                                                                             size_t length)
-    : TRTPluginBase(name) {}
-MultiScaleDeformableAttnPluginDynamic::~MultiScaleDeformableAttnPluginDynamic() {}
-
-nvinfer1::IPluginV2DynamicExt *MultiScaleDeformableAttnPluginDynamic::clone() const TRT_NOEXCEPT {
-  MultiScaleDeformableAttnPluginDynamic *plugin =
-      new MultiScaleDeformableAttnPluginDynamic(mLayerName);
-  plugin->setPluginNamespace(getPluginNamespace());
-
-  return plugin;
-}
-
-nvinfer1::DimsExprs MultiScaleDeformableAttnPluginDynamic::getOutputDimensions(
-    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
-    nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
-  nvinfer1::DimsExprs ret;
-  ret.nbDims = 3;
-  ret.d[0] = inputs[0].d[0];
-  ret.d[1] = inputs[3].d[1];
-
-  ret.d[2] = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[2], *inputs[0].d[3]);
-
-  return ret;
-}
-
-bool MultiScaleDeformableAttnPluginDynamic::supportsFormatCombination(
-    int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT {
-  if (ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) {
-    if ((pos == 1) || (pos == 2)) {
-      return (ioDesc[pos].type == nvinfer1::DataType::kINT32);
-    } else {
-      return ((ioDesc[pos].type == ioDesc[0].type) &&
-              ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT) ||
-               (ioDesc[pos].type == nvinfer1::DataType::kHALF)));
-    }
-  } else {
-    return false;
-  }
-}
-
-void MultiScaleDeformableAttnPluginDynamic::configurePlugin(
-    const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
-    const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) TRT_NOEXCEPT {}
-
-size_t MultiScaleDeformableAttnPluginDynamic::getWorkspaceSize(
-    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
-    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const TRT_NOEXCEPT {
-  return 0;
-}
-
-int MultiScaleDeformableAttnPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
-                                                   const nvinfer1::PluginTensorDesc *outputDesc,
-                                                   const void *const *inputs, void *const *outputs,
-                                                   void *workSpace,
-                                                   cudaStream_t stream) TRT_NOEXCEPT {
-  int32_t const batch = inputDesc[0].dims.d[0];
-  int32_t spatial_size = inputDesc[0].dims.d[1];
-  int32_t num_heads = inputDesc[0].dims.d[2];
-  int32_t channels = inputDesc[0].dims.d[3];
-  int32_t num_levels = inputDesc[1].dims.d[0];
-  int32_t num_query = inputDesc[3].dims.d[1];
-  int32_t num_point = inputDesc[3].dims.d[4];
-  int32_t rc = 0;
-  if (inputDesc[0].type == nvinfer1::DataType::kFLOAT) {
-    float const *value = static_cast<float const *>(inputs[0]);
-    int32_t const *spatialShapes = static_cast<int32_t const *>(inputs[1]);
-    int32_t const *levelStartIndex = static_cast<int32_t const *>(inputs[2]);
-    float const *samplingLoc = static_cast<float const *>(inputs[3]);
-    float const *attnWeight = static_cast<float const *>(inputs[4]);
-    float *output = static_cast<float *>(outputs[0]);
-
-    rc = ms_deform_attn_cuda_forward(value, spatialShapes, levelStartIndex, samplingLoc, attnWeight,
-                                     output, batch, spatial_size, num_heads, channels, num_levels,
-                                     num_query, num_point, stream);
-  } else if (inputDesc[0].type == nvinfer1::DataType::kHALF) {
-    const __half *value = static_cast<const __half *>(inputs[0]);
-    int32_t const *spatialShapes = static_cast<int32_t const *>(inputs[1]);
-    int32_t const *levelStartIndex = static_cast<int32_t const *>(inputs[2]);
-    const __half *samplingLoc = static_cast<const __half *>(inputs[3]);
-    const __half *attnWeight = static_cast<const __half *>(inputs[4]);
-    __half *output = static_cast<__half *>(outputs[0]);
-
-    rc = ms_deform_attn_cuda_forward(value, spatialShapes, levelStartIndex, samplingLoc, attnWeight,
-                                     output, batch, spatial_size, num_heads, channels, num_levels,
-                                     num_query, num_point, stream);
-  }
-
-  return rc;
-}
-
-nvinfer1::DataType MultiScaleDeformableAttnPluginDynamic::getOutputDataType(
-    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT {
-  return inputTypes[0];
-}
-
-// IPluginV2 Methods
-const char *MultiScaleDeformableAttnPluginDynamic::getPluginType() const TRT_NOEXCEPT {
-  return PLUGIN_NAME;
-}
-
-const char *MultiScaleDeformableAttnPluginDynamic::getPluginVersion() const TRT_NOEXCEPT {
-  return PLUGIN_VERSION;
-}
-
-int MultiScaleDeformableAttnPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; }
-
-size_t MultiScaleDeformableAttnPluginDynamic::getSerializationSize() const TRT_NOEXCEPT {
-  return 0;
-}
-
-void MultiScaleDeformableAttnPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT {}
-
-void MultiScaleDeformableAttnPluginDynamic::attachToContext(
-    cudnnContext *cudnnContext, cublasContext *cublasContext,
-    nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT {}
-
-void MultiScaleDeformableAttnPluginDynamic::detachFromContext() TRT_NOEXCEPT {}
-
-////////////////////// creator /////////////////////////////
-
-MultiScaleDeformableAttnPluginDynamicCreator::MultiScaleDeformableAttnPluginDynamicCreator() {
-  mPluginAttributes.clear();
-  mFC.nbFields = mPluginAttributes.size();
-  mFC.fields = mPluginAttributes.data();
-}
-
-const char *MultiScaleDeformableAttnPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT {
-  return PLUGIN_NAME;
-}
-
-const char *MultiScaleDeformableAttnPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT {
-  return PLUGIN_VERSION;
-}
-
-nvinfer1::IPluginV2 *MultiScaleDeformableAttnPluginDynamicCreator::createPlugin(
-    const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
-  MultiScaleDeformableAttnPluginDynamic *plugin = new MultiScaleDeformableAttnPluginDynamic(name);
-  plugin->setPluginNamespace(getPluginNamespace());
-  return plugin;
-}
-
-nvinfer1::IPluginV2 *MultiScaleDeformableAttnPluginDynamicCreator::deserializePlugin(
-    const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
-  auto plugin = new MultiScaleDeformableAttnPluginDynamic(name, serialData, serialLength);
-  plugin->setPluginNamespace(getPluginNamespace());
-  return plugin;
-}
-REGISTER_TENSORRT_PLUGIN(MultiScaleDeformableAttnPluginDynamicCreator);
+namespace mmdeploy
+{
+    namespace
+    {
+        static const char* PLUGIN_VERSION{"1"};
+        static const char* PLUGIN_NAME{"MMCVMultiScaleDeformableAttention"};
+    }  // namespace
+
+    MultiScaleDeformableAttnPluginDynamic::MultiScaleDeformableAttnPluginDynamic(
+        const std::string& name)
+        : TRTPluginBase(name)
+    {
+    }
+
+    MultiScaleDeformableAttnPluginDynamic::MultiScaleDeformableAttnPluginDynamic(const std::string name,
+                                                                                 const void* data,
+                                                                                 size_t length)
+        : TRTPluginBase(name)
+    {
+    }
+    MultiScaleDeformableAttnPluginDynamic::~MultiScaleDeformableAttnPluginDynamic() {}
+
+    nvinfer1::IPluginV2DynamicExt* MultiScaleDeformableAttnPluginDynamic::clone() const TRT_NOEXCEPT
+    {
+        MultiScaleDeformableAttnPluginDynamic* plugin =
+            new MultiScaleDeformableAttnPluginDynamic(mLayerName);
+        plugin->setPluginNamespace(getPluginNamespace());
+
+        return plugin;
+    }
+
+    nvinfer1::DimsExprs MultiScaleDeformableAttnPluginDynamic::getOutputDimensions(
+        int outputIndex,
+        const nvinfer1::DimsExprs* inputs,
+        int nbInputs,
+        nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT
+    {
+        nvinfer1::DimsExprs ret;
+        ret.nbDims = 3;
+        ret.d[0] = inputs[0].d[0];
+        ret.d[1] = inputs[3].d[1];
+
+        ret.d[2] = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[2], *inputs[0].d[3]);
+
+        return ret;
+    }
+
+    bool MultiScaleDeformableAttnPluginDynamic::supportsFormatCombination(
+        int pos,
+        const nvinfer1::PluginTensorDesc* ioDesc,
+        int nbInputs,
+        int nbOutputs) TRT_NOEXCEPT
+    {
+        if (ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR)
+        {
+            if ((pos == 1) || (pos == 2))
+            {
+                return (ioDesc[pos].type == nvinfer1::DataType::kINT32);
+            }
+            else
+            {
+                return ((ioDesc[pos].type == ioDesc[0].type) &&
+                        ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT) ||
+                         (ioDesc[pos].type == nvinfer1::DataType::kHALF)));
+            }
+        }
+        else
+        {
+            return false;
+        }
+    }
+
+    void MultiScaleDeformableAttnPluginDynamic::configurePlugin(
+        const nvinfer1::DynamicPluginTensorDesc* inputs,
+        int nbInputs,
+        const nvinfer1::DynamicPluginTensorDesc* outputs,
+        int nbOutputs) TRT_NOEXCEPT {}
+
+    size_t MultiScaleDeformableAttnPluginDynamic::getWorkspaceSize(
+        const nvinfer1::PluginTensorDesc* inputs,
+        int nbInputs,
+        const nvinfer1::PluginTensorDesc* outputs,
+        int nbOutputs) const TRT_NOEXCEPT
+    {
+        return 0;
+    }
+
+    int MultiScaleDeformableAttnPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                                                       const nvinfer1::PluginTensorDesc* outputDesc,
+                                                       const void* const* inputs,
+                                                       void* const* outputs,
+                                                       void* workSpace,
+                                                       cudaStream_t stream) TRT_NOEXCEPT
+    {
+        int32_t const batch = inputDesc[0].dims.d[0];
+        int32_t spatial_size = inputDesc[0].dims.d[1];
+        int32_t num_heads = inputDesc[0].dims.d[2];
+        int32_t channels = inputDesc[0].dims.d[3];
+        int32_t num_levels = inputDesc[1].dims.d[0];
+        int32_t num_query = inputDesc[3].dims.d[1];
+        int32_t num_point = inputDesc[3].dims.d[4];
+        int32_t rc = 0;
+        if (inputDesc[0].type == nvinfer1::DataType::kFLOAT)
+        {
+            float const* value = static_cast<float const*>(inputs[0]);
+            int32_t const* spatialShapes = static_cast<int32_t const*>(inputs[1]);
+            int32_t const* levelStartIndex = static_cast<int32_t const*>(inputs[2]);
+            float const* samplingLoc = static_cast<float const*>(inputs[3]);
+            float const* attnWeight = static_cast<float const*>(inputs[4]);
+            float* output = static_cast<float*>(outputs[0]);
+
+            rc = ms_deform_attn_cuda_forward(value, spatialShapes, levelStartIndex, samplingLoc, attnWeight, output, batch, spatial_size, num_heads, channels, num_levels, num_query, num_point, stream);
+        }
+        else if (inputDesc[0].type == nvinfer1::DataType::kHALF)
+        {
+            const __half* value = static_cast<const __half*>(inputs[0]);
+            int32_t const* spatialShapes = static_cast<int32_t const*>(inputs[1]);
+            int32_t const* levelStartIndex = static_cast<int32_t const*>(inputs[2]);
+            const __half* samplingLoc = static_cast<const __half*>(inputs[3]);
+            const __half* attnWeight = static_cast<const __half*>(inputs[4]);
+            __half* output = static_cast<__half*>(outputs[0]);
+
+            rc = ms_deform_attn_cuda_forward(value, spatialShapes, levelStartIndex, samplingLoc,
attnWeight, output, batch, spatial_size, num_heads, channels, num_levels, num_query, num_point, stream); + } + + return rc; + } + + nvinfer1::DataType MultiScaleDeformableAttnPluginDynamic::getOutputDataType( + int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* MultiScaleDeformableAttnPluginDynamic::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* MultiScaleDeformableAttnPluginDynamic::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int MultiScaleDeformableAttnPluginDynamic::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t MultiScaleDeformableAttnPluginDynamic::getSerializationSize() const TRT_NOEXCEPT + { + return 0; + } + + void MultiScaleDeformableAttnPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT {} + + void MultiScaleDeformableAttnPluginDynamic::attachToContext( + cudnnContext* cudnnContext, + cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {} + + void MultiScaleDeformableAttnPluginDynamic::detachFromContext() TRT_NOEXCEPT {} + + ////////////////////// creator ///////////////////////////// + + MultiScaleDeformableAttnPluginDynamicCreator::MultiScaleDeformableAttnPluginDynamicCreator() + { + mPluginAttributes.clear(); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* MultiScaleDeformableAttnPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* MultiScaleDeformableAttnPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* MultiScaleDeformableAttnPluginDynamicCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + MultiScaleDeformableAttnPluginDynamic* plugin = new MultiScaleDeformableAttnPluginDynamic(name); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* MultiScaleDeformableAttnPluginDynamicCreator::deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new MultiScaleDeformableAttnPluginDynamic(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + REGISTER_TENSORRT_PLUGIN(MultiScaleDeformableAttnPluginDynamicCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.hpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.hpp index 7e66e9e54d..5a2c78baf9 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.hpp @@ -9,62 +9,59 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class MultiScaleDeformableAttnPluginDynamic : public TRTPluginBase { - public: - MultiScaleDeformableAttnPluginDynamic(const std::string &name); +namespace mmdeploy +{ + class MultiScaleDeformableAttnPluginDynamic : public TRTPluginBase + { + public: + MultiScaleDeformableAttnPluginDynamic(const std::string& name); - MultiScaleDeformableAttnPluginDynamic(const std::string name, const void *data, size_t length); + MultiScaleDeformableAttnPluginDynamic(const std::string name, const void* data, size_t length); - MultiScaleDeformableAttnPluginDynamic(); + MultiScaleDeformableAttnPluginDynamic(); 
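[Reviewer note, not part of the patch] The reformatted getOutputDimensions above preserves the plugin's output contract: with value shaped [batch, spatial_size, num_heads, channels] and sampling_loc shaped [batch, num_query, num_heads, num_levels, num_point, 2], the output is [batch, num_query, num_heads * channels]. A minimal plain-C++ sketch of that shape arithmetic, with a hypothetical helper name and example sizes (independent of the TensorRT headers):

#include <array>
#include <cstdint>
#include <cstdio>

// Mirrors getOutputDimensions: d[0] = inputs[0].d[0], d[1] = inputs[3].d[1],
// d[2] = inputs[0].d[2] * inputs[0].d[3] (the kPROD expression above).
std::array<int64_t, 3> msdaOutputShape(const std::array<int64_t, 4>& value,
                                       const std::array<int64_t, 6>& samplingLoc)
{
    return {value[0], samplingLoc[1], value[2] * value[3]};
}

int main()
{
    auto out = msdaOutputShape({2, 13101, 8, 32}, {2, 900, 8, 4, 4, 2});
    std::printf("%lld x %lld x %lld\n", (long long)out[0], (long long)out[1], (long long)out[2]);
    return 0;
}

The same flattening of (num_heads, channels) into one trailing dimension is what the enqueue path assumes when it hands raw pointers to ms_deform_attn_cuda_forward.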
- ~MultiScaleDeformableAttnPluginDynamic() TRT_NOEXCEPT override; + ~MultiScaleDeformableAttnPluginDynamic() TRT_NOEXCEPT override; - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override; - void detachFromContext() TRT_NOEXCEPT override; + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) + TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; + void detachFromContext() TRT_NOEXCEPT override; - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; -}; + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + }; -class MultiScaleDeformableAttnPluginDynamicCreator : public TRTPluginCreatorBase { - public: - 
MultiScaleDeformableAttnPluginDynamicCreator();
+    class MultiScaleDeformableAttnPluginDynamicCreator : public TRTPluginCreatorBase
+    {
+      public:
+        MultiScaleDeformableAttnPluginDynamicCreator();

-  const char *getPluginName() const TRT_NOEXCEPT override;
+        const char* getPluginName() const TRT_NOEXCEPT override;

-  const char *getPluginVersion() const TRT_NOEXCEPT override;
+        const char* getPluginVersion() const TRT_NOEXCEPT override;

-  nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
-      TRT_NOEXCEPT override;
+        nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc)
+            TRT_NOEXCEPT override;

-  nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
-                                         size_t serialLength) TRT_NOEXCEPT override;
-};
+        nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;
+    };
 }  // namespace mmdeploy

 #endif  // TRT_MS_DEFORM_ATTN_HPP
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cu
index 6b7588eae0..81ddcc6585 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cu
+++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cu
@@ -7,58 +7,91 @@
 #include "trt_ms_deform_attn_kernel.hpp"
 #include "trt_plugin_helper.hpp"

-template <typename scalar_t>
-void ms_deformable_im2col_cuda(cudaStream_t stream, scalar_t const* dataValue,
-                               int32_t const* dataSpatialShapes, int32_t const* dataLevelStartIndex,
-                               scalar_t const* dataSamplingLoc, scalar_t const* dataAttnWeight,
-                               int32_t const batchSize, int32_t const spatialSize,
-                               int32_t const numHeads, int32_t const channels,
-                               int32_t const numLevels, int32_t const numQuery,
-                               int32_t const numPoint, scalar_t* dataCol) {
-  int32_t const numKernels = batchSize * numQuery * numHeads * channels;
-  int32_t const numActualKernels = batchSize * numQuery * numHeads * channels;
+template<typename scalar_t>
+void ms_deformable_im2col_cuda(cudaStream_t stream, scalar_t const* dataValue, int32_t const* dataSpatialShapes, int32_t const* dataLevelStartIndex, scalar_t const* dataSamplingLoc, scalar_t const* dataAttnWeight, int32_t const batchSize, int32_t const spatialSize, int32_t const numHeads, int32_t const channels, int32_t const numLevels, int32_t const numQuery, int32_t const numPoint, scalar_t* dataCol)
+{
+    int32_t const numKernels = batchSize * numQuery * numHeads * channels;
+    int32_t const numActualKernels = batchSize * numQuery * numHeads * channels;

-  ms_deformable_im2col_gpu_kernel<scalar_t>
-      <<<GET_BLOCKS(numActualKernels), THREADS_PER_BLOCK, 0, stream>>>(
-          numKernels, dataValue, dataSpatialShapes, dataLevelStartIndex, dataSamplingLoc,
-          dataAttnWeight, batchSize, spatialSize, numHeads, channels, numLevels, numQuery, numPoint,
-          dataCol);
+    ms_deformable_im2col_gpu_kernel<scalar_t>
+        <<<GET_BLOCKS(numActualKernels), THREADS_PER_BLOCK, 0, stream>>>(
+            numKernels,
+            dataValue,
+            dataSpatialShapes,
+            dataLevelStartIndex,
+            dataSamplingLoc,
+            dataAttnWeight,
+            batchSize,
+            spatialSize,
+            numHeads,
+            channels,
+            numLevels,
+            numQuery,
+            numPoint,
+            dataCol);
 }

-template <typename scalar_t>
-int32_t ms_deform_attn_cuda_forward(const scalar_t* value, const int32_t* spatialShapes,
-                                    const int32_t* levelStartIndex, const scalar_t* samplingLoc,
-                                    const scalar_t* attnWeight, scalar_t* output, int32_t batch,
-                                    int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels,
-                                    int32_t mNumLevels, int32_t mNumQuery, int32_t mNumPoint,
-                                    cudaStream_t stream) {
-  auto perValueSize = mSpatialSize * mNumHeads * mChannels;
-  auto perSampleLocSize = mNumQuery * mNumHeads * mNumLevels * mNumPoint * 2;
-  auto perAttnWeightSize = mNumQuery * mNumHeads * mNumLevels * mNumPoint;
-  auto perOutputSize = mNumQuery * mNumHeads * mChannels;
+template<typename scalar_t>
+int32_t ms_deform_attn_cuda_forward(const scalar_t* value, const int32_t* spatialShapes, const int32_t* levelStartIndex, const scalar_t* samplingLoc, const scalar_t* attnWeight, scalar_t* output, int32_t batch, int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels, int32_t mNumLevels, int32_t mNumQuery, int32_t mNumPoint, cudaStream_t stream)
+{
+    auto perValueSize = mSpatialSize * mNumHeads * mChannels;
+    auto perSampleLocSize = mNumQuery * mNumHeads * mNumLevels * mNumPoint * 2;
+    auto perAttnWeightSize = mNumQuery * mNumHeads * mNumLevels * mNumPoint;
+    auto perOutputSize = mNumQuery * mNumHeads * mChannels;

-  int32_t mIm2colStep = batch;
+    int32_t mIm2colStep = batch;

-  for (int32_t n = 0; n < batch / mIm2colStep; ++n) {
-    auto columns = output + n * mIm2colStep * perOutputSize;
-    ms_deformable_im2col_cuda<scalar_t>(
-        stream, value + n * mIm2colStep * perValueSize, spatialShapes, levelStartIndex,
-        samplingLoc + n * mIm2colStep * perSampleLocSize,
-        attnWeight + n * mIm2colStep * perAttnWeightSize, mIm2colStep, mSpatialSize, mNumHeads,
-        mChannels, mNumLevels, mNumQuery, mNumPoint, columns);
-  }
+    for (int32_t n = 0; n < batch / mIm2colStep; ++n)
+    {
+        auto columns = output + n * mIm2colStep * perOutputSize;
+        ms_deformable_im2col_cuda<scalar_t>(
+            stream,
+            value + n * mIm2colStep * perValueSize,
+            spatialShapes,
+            levelStartIndex,
+            samplingLoc + n * mIm2colStep * perSampleLocSize,
+            attnWeight + n * mIm2colStep * perAttnWeightSize,
+            mIm2colStep,
+            mSpatialSize,
+            mNumHeads,
+            mChannels,
+            mNumLevels,
+            mNumQuery,
+            mNumPoint,
+            columns);
+    }

-  return 0;
+    return 0;
 }

 template int32_t ms_deform_attn_cuda_forward<float>(
-    const float* value, const int32_t* spatialShapes, const int32_t* levelStartIndex,
-    const float* samplingLoc, const float* attnWeight, float* output, int32_t batch,
-    int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels, int32_t mNumLevels,
-    int32_t mNumQuery, int32_t mNumPoint, cudaStream_t stream);
+    const float* value,
+    const int32_t* spatialShapes,
+    const int32_t* levelStartIndex,
+    const float* samplingLoc,
+    const float* attnWeight,
+    float* output,
+    int32_t batch,
+    int32_t mSpatialSize,
+    int32_t mNumHeads,
+    int32_t mChannels,
+    int32_t mNumLevels,
+    int32_t mNumQuery,
+    int32_t mNumPoint,
+    cudaStream_t stream);

 template int32_t ms_deform_attn_cuda_forward<__half>(
-    const __half* value, const int32_t* spatialShapes, const int32_t* levelStartIndex,
-    const __half* samplingLoc, const __half* attnWeight, __half* output, int32_t batch,
-    int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels, int32_t mNumLevels,
-    int32_t mNumQuery, int32_t mNumPoint, cudaStream_t stream);
+    const __half* value,
+    const int32_t* spatialShapes,
+    const int32_t* levelStartIndex,
+    const __half* samplingLoc,
+    const __half* attnWeight,
+    __half* output,
+    int32_t batch,
+    int32_t mSpatialSize,
+    int32_t mNumHeads,
+    int32_t mChannels,
+    int32_t mNumLevels,
+    int32_t mNumQuery,
+    int32_t mNumPoint,
+    cudaStream_t stream);
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cuh b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cuh
index cee34cfe65..2b62e7fc30 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cuh
+++
b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cuh @@ -4,254 +4,294 @@ #include "common_cuda_helper.hpp" -template -__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t*& bottom_data, const int& height, - const int& width, const int& nheads, - const int& channels, const scalar_t& h, - const scalar_t& w, const int& m, const int& c) { - const int h_low = floorf(h); - const int w_low = floorf(w); - const int h_high = h_low + 1; - const int w_high = w_low + 1; +template +__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t*& bottom_data, const int& height, const int& width, const int& nheads, const int& channels, const scalar_t& h, const scalar_t& w, const int& m, const int& c) +{ + const int h_low = floorf(h); + const int w_low = floorf(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; - const scalar_t lh = h - h_low; - const scalar_t lw = w - w_low; - const scalar_t hh = 1 - lh, hw = 1 - lw; + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; - const int w_stride = nheads * channels; - const int h_stride = width * w_stride; - const int h_low_ptr_offset = h_low * h_stride; - const int h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int w_low_ptr_offset = w_low * w_stride; - const int w_high_ptr_offset = w_low_ptr_offset + w_stride; - const int base_ptr = m * channels + c; + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; - scalar_t v1 = 0; - if (h_low >= 0 && w_low >= 0) { - const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; - v1 = bottom_data[ptr1]; - } - scalar_t v2 = 0; - if (h_low >= 0 && w_high <= width - 1) { - const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; - v2 = bottom_data[ptr2]; - } - scalar_t v3 = 0; - if (h_high <= height - 1 && w_low >= 0) { - const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; - v3 = bottom_data[ptr3]; - } - scalar_t v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) { - const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; - v4 = bottom_data[ptr4]; - } + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } - const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; } -template <> +template<> __device__ __half ms_deform_attn_im2col_bilinear<__half>( - const __half*& bottomData, int32_t const& height, int32_t const& width, int32_t const& 
nHeads, - int32_t const& channels, const __half& h, const __half& w, int32_t const& m, int32_t const& c) { - int32_t const hLow = __half2int_rd(h); - int32_t const wLow = __half2int_rd(w); - int32_t const hHigh = hLow + 1; - int32_t const wHigh = wLow + 1; + const __half*& bottomData, + int32_t const& height, + int32_t const& width, + int32_t const& nHeads, + int32_t const& channels, + const __half& h, + const __half& w, + int32_t const& m, + int32_t const& c) +{ + int32_t const hLow = __half2int_rd(h); + int32_t const wLow = __half2int_rd(w); + int32_t const hHigh = hLow + 1; + int32_t const wHigh = wLow + 1; - const __half kZERO = __int2half_rz(0); - const __half one = __int2half_rz(1); + const __half kZERO = __int2half_rz(0); + const __half one = __int2half_rz(1); #if __CUDA_ARCH__ >= 530 - const __half lh = __hsub(h, __int2half_rd(hLow)); - const __half lw = __hsub(w, __int2half_rd(wLow)); - const __half hh = __hsub(one, lh), hw = __hsub(one, lw); + const __half lh = __hsub(h, __int2half_rd(hLow)); + const __half lw = __hsub(w, __int2half_rd(wLow)); + const __half hh = __hsub(one, lh), hw = __hsub(one, lw); #else - const __half lh = __float2half(__half2float(h) - hLow); - const __half lw = __float2half(__half2float(w) - wLow); - const __half hh = __float2half(__half2float(one) - __half2float(lh)); - const __half hw = __float2half(__half2float(one) - __half2float(lw)); + const __half lh = __float2half(__half2float(h) - hLow); + const __half lw = __float2half(__half2float(w) - wLow); + const __half hh = __float2half(__half2float(one) - __half2float(lh)); + const __half hw = __float2half(__half2float(one) - __half2float(lw)); #endif - int32_t const wStride = nHeads * channels; - int32_t const hStride = width * wStride; - int32_t const hLowPtrOffset = hLow * hStride; - int32_t const hHighPtrOffset = hLowPtrOffset + hStride; - int32_t const wLowPtrOffset = wLow * wStride; - int32_t const wHighPtrOffset = wLowPtrOffset + wStride; - int32_t const basePtr = m * channels + c; + int32_t const wStride = nHeads * channels; + int32_t const hStride = width * wStride; + int32_t const hLowPtrOffset = hLow * hStride; + int32_t const hHighPtrOffset = hLowPtrOffset + hStride; + int32_t const wLowPtrOffset = wLow * wStride; + int32_t const wHighPtrOffset = wLowPtrOffset + wStride; + int32_t const basePtr = m * channels + c; - __half v1 = kZERO; - if (hLow >= 0 && wLow >= 0) { - int32_t const ptr1 = hLowPtrOffset + wLowPtrOffset + basePtr; - v1 = bottomData[ptr1]; - } - __half v2 = kZERO; - if (hLow >= 0 && wHigh <= width - 1) { - int32_t const ptr2 = hLowPtrOffset + wHighPtrOffset + basePtr; - v2 = bottomData[ptr2]; - } - __half v3 = kZERO; - if (hHigh <= height - 1 && wLow >= 0) { - int32_t const ptr3 = hHighPtrOffset + wLowPtrOffset + basePtr; - v3 = bottomData[ptr3]; - } - __half v4 = kZERO; - if (hHigh <= height - 1 && wHigh <= width - 1) { - int32_t const ptr4 = hHighPtrOffset + wHighPtrOffset + basePtr; - v4 = bottomData[ptr4]; - } + __half v1 = kZERO; + if (hLow >= 0 && wLow >= 0) + { + int32_t const ptr1 = hLowPtrOffset + wLowPtrOffset + basePtr; + v1 = bottomData[ptr1]; + } + __half v2 = kZERO; + if (hLow >= 0 && wHigh <= width - 1) + { + int32_t const ptr2 = hLowPtrOffset + wHighPtrOffset + basePtr; + v2 = bottomData[ptr2]; + } + __half v3 = kZERO; + if (hHigh <= height - 1 && wLow >= 0) + { + int32_t const ptr3 = hHighPtrOffset + wLowPtrOffset + basePtr; + v3 = bottomData[ptr3]; + } + __half v4 = kZERO; + if (hHigh <= height - 1 && wHigh <= width - 1) + { + int32_t const ptr4 = hHighPtrOffset + 
wHighPtrOffset + basePtr; + v4 = bottomData[ptr4]; + } #if __CUDA_ARCH__ >= 530 - __half w1 = __hmul(__hmul(hh, hw), v1); - __half w2 = __hmul(__hmul(hh, lw), v2); - __half w3 = __hmul(__hmul(lh, hw), v3); - __half w4 = __hmul(__hmul(lh, lw), v4); + __half w1 = __hmul(__hmul(hh, hw), v1); + __half w2 = __hmul(__hmul(hh, lw), v2); + __half w3 = __hmul(__hmul(lh, hw), v3); + __half w4 = __hmul(__hmul(lh, lw), v4); - w1 = __hadd(w1, w2); - w3 = __hadd(w3, w4); + w1 = __hadd(w1, w2); + w3 = __hadd(w3, w4); - const __half val = __hadd(w1, w3); + const __half val = __hadd(w1, w3); #else - __half w1 = __float2half((__half2float(hh) * __half2float(hw)) * __half2float(v1)); - __half w2 = __float2half((__half2float(hh) * __half2float(lw)) * __half2float(v2)); - __half w3 = __float2half((__half2float(lh) * __half2float(hw)) * __half2float(v3)); - __half w4 = __float2half((__half2float(lh) * __half2float(lw)) * __half2float(v4)); + __half w1 = __float2half((__half2float(hh) * __half2float(hw)) * __half2float(v1)); + __half w2 = __float2half((__half2float(hh) * __half2float(lw)) * __half2float(v2)); + __half w3 = __float2half((__half2float(lh) * __half2float(hw)) * __half2float(v3)); + __half w4 = __float2half((__half2float(lh) * __half2float(lw)) * __half2float(v4)); - w1 = __float2half(__half2float(w1) + __half2float(w2)); - w3 = __float2half(__half2float(w3) + __half2float(w4)); + w1 = __float2half(__half2float(w1) + __half2float(w2)); + w3 = __float2half(__half2float(w3) + __half2float(w4)); - const __half val = __float2half(__half2float(w1) + __half2float(w3)); + const __half val = __float2half(__half2float(w1) + __half2float(w3)); #endif - return val; + return val; } #if 1 -template +template __global__ void ms_deformable_im2col_gpu_kernel( - int32_t const n, scalar_t const* dataValue, int32_t const* dataSpatialShapes, - int32_t const* dataLevelStartIndex, scalar_t const* dataSamplingLoc, - scalar_t const* dataAttnWeight, int32_t const batchSize, int32_t const spatialSize, - int32_t const numHeads, int32_t const channels, int32_t const numLevels, int32_t const numQuery, - int32_t const numPoint, scalar_t* dataCol) { - CUDA_1D_KERNEL_LOOP(index, n) { - int32_t _temp = index; - int32_t const cCol = _temp % channels; - _temp /= channels; - int32_t const samplingIndex = _temp; - int32_t const mCol = _temp % numHeads; - _temp /= numHeads; - _temp /= numQuery; - int32_t const bCol = _temp; + int32_t const n, + scalar_t const* dataValue, + int32_t const* dataSpatialShapes, + int32_t const* dataLevelStartIndex, + scalar_t const* dataSamplingLoc, + scalar_t const* dataAttnWeight, + int32_t const batchSize, + int32_t const spatialSize, + int32_t const numHeads, + int32_t const channels, + int32_t const numLevels, + int32_t const numQuery, + int32_t const numPoint, + scalar_t* dataCol) +{ + CUDA_1D_KERNEL_LOOP(index, n) + { + int32_t _temp = index; + int32_t const cCol = _temp % channels; + _temp /= channels; + int32_t const samplingIndex = _temp; + int32_t const mCol = _temp % numHeads; + _temp /= numHeads; + _temp /= numQuery; + int32_t const bCol = _temp; - scalar_t* dataColPtr = dataCol + index; - int32_t dataWeightPtr = samplingIndex * numLevels * numPoint; - int32_t dataLocWPtr = dataWeightPtr << 1; - int32_t const qidStride = numHeads * channels; - int32_t const dataValuePtrInitOffset = bCol * spatialSize * qidStride; - scalar_t col = 0; + scalar_t* dataColPtr = dataCol + index; + int32_t dataWeightPtr = samplingIndex * numLevels * numPoint; + int32_t dataLocWPtr = dataWeightPtr << 1; + int32_t const 
qidStride = numHeads * channels; + int32_t const dataValuePtrInitOffset = bCol * spatialSize * qidStride; + scalar_t col = 0; - for (int32_t lCol = 0; lCol < numLevels; ++lCol) { - int32_t const levelStartId = dataLevelStartIndex[lCol]; - int32_t const spatialHPtr = lCol << 1; - int32_t const spatialH = dataSpatialShapes[spatialHPtr]; - int32_t const spatialW = dataSpatialShapes[spatialHPtr + 1]; - scalar_t const* dataValuePtr = - dataValue + (dataValuePtrInitOffset + levelStartId * qidStride); - for (int32_t pCol = 0; pCol < numPoint; ++pCol) { - scalar_t const locW = dataSamplingLoc[dataLocWPtr]; - scalar_t const locH = dataSamplingLoc[dataLocWPtr + 1]; - scalar_t const weight = dataAttnWeight[dataWeightPtr]; + for (int32_t lCol = 0; lCol < numLevels; ++lCol) + { + int32_t const levelStartId = dataLevelStartIndex[lCol]; + int32_t const spatialHPtr = lCol << 1; + int32_t const spatialH = dataSpatialShapes[spatialHPtr]; + int32_t const spatialW = dataSpatialShapes[spatialHPtr + 1]; + scalar_t const* dataValuePtr = + dataValue + (dataValuePtrInitOffset + levelStartId * qidStride); + for (int32_t pCol = 0; pCol < numPoint; ++pCol) + { + scalar_t const locW = dataSamplingLoc[dataLocWPtr]; + scalar_t const locH = dataSamplingLoc[dataLocWPtr + 1]; + scalar_t const weight = dataAttnWeight[dataWeightPtr]; - scalar_t const hIm = locH * spatialH - 0.5; - scalar_t const wIm = locW * spatialW - 0.5; + scalar_t const hIm = locH * spatialH - 0.5; + scalar_t const wIm = locW * spatialW - 0.5; - if (hIm > -1 && wIm > -1 && hIm < spatialH && wIm < spatialW) { - col += ms_deform_attn_im2col_bilinear(dataValuePtr, spatialH, spatialW, numHeads, - channels, hIm, wIm, mCol, cCol) * - weight; - } + if (hIm > -1 && wIm > -1 && hIm < spatialH && wIm < spatialW) + { + col += ms_deform_attn_im2col_bilinear(dataValuePtr, spatialH, spatialW, numHeads, channels, hIm, wIm, mCol, cCol) * + weight; + } - dataWeightPtr += 1; - dataLocWPtr += 2; - } + dataWeightPtr += 1; + dataLocWPtr += 2; + } + } + *dataColPtr = col; } - *dataColPtr = col; - } } -template <> +template<> __global__ void ms_deformable_im2col_gpu_kernel<__half>( - int32_t const n, const __half* dataValue, int32_t const* dataSpatialShapes, - int32_t const* dataLevelStartIndex, const __half* dataSamplingLoc, const __half* dataAttnWeight, - int32_t const batchSize, int32_t const spatialSize, int32_t const numHeads, - int32_t const channels, int32_t const numLevels, int32_t const numQuery, int32_t const numPoint, - __half* dataCol) { - CUDA_1D_KERNEL_LOOP(index, n) { - int32_t _temp = index; - int32_t const cCol = _temp % channels; - _temp /= channels; - int32_t const samplingIndex = _temp; - int32_t const mCol = _temp % numHeads; - _temp /= numHeads; - _temp /= numQuery; - int32_t const bCol = _temp; + int32_t const n, + const __half* dataValue, + int32_t const* dataSpatialShapes, + int32_t const* dataLevelStartIndex, + const __half* dataSamplingLoc, + const __half* dataAttnWeight, + int32_t const batchSize, + int32_t const spatialSize, + int32_t const numHeads, + int32_t const channels, + int32_t const numLevels, + int32_t const numQuery, + int32_t const numPoint, + __half* dataCol) +{ + CUDA_1D_KERNEL_LOOP(index, n) + { + int32_t _temp = index; + int32_t const cCol = _temp % channels; + _temp /= channels; + int32_t const samplingIndex = _temp; + int32_t const mCol = _temp % numHeads; + _temp /= numHeads; + _temp /= numQuery; + int32_t const bCol = _temp; - __half* dataColPtr = dataCol + index; - int32_t dataWeightPtr = samplingIndex * numLevels * numPoint; - 
int32_t dataLocWPtr = dataWeightPtr << 1; - int32_t const qidStride = numHeads * channels; - int32_t const dataValuePtrInitOffset = bCol * spatialSize * qidStride; - const __half kZERO_POINT_FIVE = __float2half(0.5f); - const __half kMINUS_ONE = __float2half(-1.0f); - const __half kZERO = __int2half_rz(0); - __half tpVal = kZERO; - __half col = kZERO; + __half* dataColPtr = dataCol + index; + int32_t dataWeightPtr = samplingIndex * numLevels * numPoint; + int32_t dataLocWPtr = dataWeightPtr << 1; + int32_t const qidStride = numHeads * channels; + int32_t const dataValuePtrInitOffset = bCol * spatialSize * qidStride; + const __half kZERO_POINT_FIVE = __float2half(0.5f); + const __half kMINUS_ONE = __float2half(-1.0f); + const __half kZERO = __int2half_rz(0); + __half tpVal = kZERO; + __half col = kZERO; - for (int32_t lCol = 0; lCol < numLevels; ++lCol) { - int32_t const levelStartId = dataLevelStartIndex[lCol]; - int32_t const spatialHPtr = lCol << 1; - int32_t const spatialH = dataSpatialShapes[spatialHPtr]; - int32_t const spatialW = dataSpatialShapes[spatialHPtr + 1]; - const __half spatialHHalf = __int2half_rd(spatialH); - const __half spatialWHalf = __int2half_rd(spatialW); - const __half* dataValuePtr = dataValue + (dataValuePtrInitOffset + levelStartId * qidStride); - for (int32_t pCol = 0; pCol < numPoint; ++pCol) { - const __half locW = dataSamplingLoc[dataLocWPtr]; - const __half locH = dataSamplingLoc[dataLocWPtr + 1]; - const __half weight = dataAttnWeight[dataWeightPtr]; -#if __CUDA_ARCH__ >= 530 - const __half hIm = __hsub(__hmul(locH, spatialHHalf), kZERO_POINT_FIVE); - const __half wIm = __hsub(__hmul(locW, spatialWHalf), kZERO_POINT_FIVE); + for (int32_t lCol = 0; lCol < numLevels; ++lCol) + { + int32_t const levelStartId = dataLevelStartIndex[lCol]; + int32_t const spatialHPtr = lCol << 1; + int32_t const spatialH = dataSpatialShapes[spatialHPtr]; + int32_t const spatialW = dataSpatialShapes[spatialHPtr + 1]; + const __half spatialHHalf = __int2half_rd(spatialH); + const __half spatialWHalf = __int2half_rd(spatialW); + const __half* dataValuePtr = dataValue + (dataValuePtrInitOffset + levelStartId * qidStride); + for (int32_t pCol = 0; pCol < numPoint; ++pCol) + { + const __half locW = dataSamplingLoc[dataLocWPtr]; + const __half locH = dataSamplingLoc[dataLocWPtr + 1]; + const __half weight = dataAttnWeight[dataWeightPtr]; + #if __CUDA_ARCH__ >= 530 + const __half hIm = __hsub(__hmul(locH, spatialHHalf), kZERO_POINT_FIVE); + const __half wIm = __hsub(__hmul(locW, spatialWHalf), kZERO_POINT_FIVE); - if (__hgt(hIm, kMINUS_ONE) && __hgt(wIm, kMINUS_ONE) && __hlt(hIm, spatialHHalf) && - __hlt(wIm, spatialWHalf)) { - tpVal = ms_deform_attn_im2col_bilinear(dataValuePtr, spatialH, spatialW, numHeads, - channels, hIm, wIm, mCol, cCol); - col = __hadd(col, __hmul(tpVal, weight)); - } -#else - const __half hIm = __float2half(__half2float(locH) * __half2float(spatialHHalf) - - __half2float(kZERO_POINT_FIVE)); - const __half wIm = __float2half(__half2float(locW) * __half2float(spatialWHalf) - - __half2float(kZERO_POINT_FIVE)); + if (__hgt(hIm, kMINUS_ONE) && __hgt(wIm, kMINUS_ONE) && __hlt(hIm, spatialHHalf) && + __hlt(wIm, spatialWHalf)) + { + tpVal = ms_deform_attn_im2col_bilinear(dataValuePtr, spatialH, spatialW, numHeads, channels, hIm, wIm, mCol, cCol); + col = __hadd(col, __hmul(tpVal, weight)); + } + #else + const __half hIm = __float2half(__half2float(locH) * __half2float(spatialHHalf) - + __half2float(kZERO_POINT_FIVE)); + const __half wIm = __float2half(__half2float(locW) 
* __half2float(spatialWHalf) - + __half2float(kZERO_POINT_FIVE)); - if ((__half2float(hIm) > __half2float(kMINUS_ONE)) && - (__half2float(wIm) > __half2float(kMINUS_ONE)) && - (__half2float(hIm) < __half2float(spatialHHalf)) && - (__half2float(wIm) < __half2float(spatialWHalf))) { - tpVal = ms_deform_attn_im2col_bilinear(dataValuePtr, spatialH, spatialW, numHeads, - channels, hIm, wIm, mCol, cCol); - col = __float2half(__half2float(col) + (__half2float(tpVal) * __half2float(weight))); + if ((__half2float(hIm) > __half2float(kMINUS_ONE)) && + (__half2float(wIm) > __half2float(kMINUS_ONE)) && + (__half2float(hIm) < __half2float(spatialHHalf)) && + (__half2float(wIm) < __half2float(spatialWHalf))) + { + tpVal = ms_deform_attn_im2col_bilinear(dataValuePtr, spatialH, spatialW, numHeads, channels, hIm, wIm, mCol, cCol); + col = __float2half(__half2float(col) + (__half2float(tpVal) * __half2float(weight))); + } + #endif + dataWeightPtr += 1; + dataLocWPtr += 2; + } } -#endif - dataWeightPtr += 1; - dataLocWPtr += 2; - } + *dataColPtr = col; } - *dataColPtr = col; - } } #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.hpp index adbe2566fd..5dafa5a169 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.hpp @@ -4,12 +4,7 @@ #include #include -template -int32_t ms_deform_attn_cuda_forward(const scalar_t* value, const int32_t* spatialShapes, - const int32_t* levelStartIndex, const scalar_t* samplingLoc, - const scalar_t* attnWeight, scalar_t* output, int32_t batch, - int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels, - int32_t mNumLevels, int32_t mNumQuery, int32_t mNumPoint, - cudaStream_t stream); +template +int32_t ms_deform_attn_cuda_forward(const scalar_t* value, const int32_t* spatialShapes, const int32_t* levelStartIndex, const scalar_t* samplingLoc, const scalar_t* attnWeight, scalar_t* output, int32_t batch, int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels, int32_t mNumLevels, int32_t mNumQuery, int32_t mNumPoint, cudaStream_t stream); #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.cpp b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.cpp index 988893125d..4325f7b89c 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.cpp @@ -9,233 +9,290 @@ #include "trt_roi_align_kernel.hpp" #include "trt_serialize.hpp" -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"MMCVRoiAlign"}; -} // namespace - -TRTRoIAlign::TRTRoIAlign(const std::string &name, int outWidth, int outHeight, float spatialScale, - int sampleRatio, int poolMode, bool aligned) - : TRTPluginBase(name), - mOutWidth(outWidth), - mOutHeight(outHeight), - mSpatialScale(spatialScale), - mSampleRatio(sampleRatio), - mPoolMode(poolMode), - mAligned(aligned) {} - -TRTRoIAlign::TRTRoIAlign(const std::string name, const void *data, size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &mOutWidth); - deserialize_value(&data, &length, &mOutHeight); - deserialize_value(&data, &length, &mSpatialScale); - deserialize_value(&data, &length, &mSampleRatio); - deserialize_value(&data, &length, &mPoolMode); - deserialize_value(&data, &length, 
&mAligned); -} - -nvinfer1::IPluginV2DynamicExt *TRTRoIAlign::clone() const TRT_NOEXCEPT { - TRTRoIAlign *plugin = new TRTRoIAlign(mLayerName, mOutWidth, mOutHeight, mSpatialScale, - mSampleRatio, mPoolMode, mAligned); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs TRTRoIAlign::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - nvinfer1::DimsExprs ret; - ret.nbDims = 4; - ret.d[0] = inputs[1].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mOutHeight); - ret.d[3] = exprBuilder.constant(mOutWidth); - - return ret; -} - -bool TRTRoIAlign::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; -} - -void TRTRoIAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT {} - -size_t TRTRoIAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - size_t output_size = 0; - size_t word_size = 0; - switch (mPoolMode) { - case 0: // max - output_size = - outputs[0].dims.d[0] * outputs[0].dims.d[1] * outputs[0].dims.d[2] * outputs[0].dims.d[3]; - word_size = mmdeploy::getElementSize(outputs[0].type); - return output_size * word_size * 2; - break; - case 1: - return 0; - break; - default: - return 0; - } - return 0; -} - -int TRTRoIAlign::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workSpace, cudaStream_t stream) TRT_NOEXCEPT { - int channels = inputDesc[0].dims.d[1]; - int height = inputDesc[0].dims.d[2]; - int width = inputDesc[0].dims.d[3]; - - int output_size = outputDesc[0].dims.d[0] * outputDesc[0].dims.d[1] * outputDesc[0].dims.d[2] * - outputDesc[0].dims.d[3]; - int word_size = mmdeploy::getElementSize(outputDesc[0].type); - - const void *feat = inputs[0]; - const void *rois = inputs[1]; - void *output = outputs[0]; - void *argmax_y = nullptr; - void *argmax_x = nullptr; - - switch (mPoolMode) { - case 0: // max - argmax_y = workSpace; - argmax_x = (char *)argmax_y + output_size * word_size; - break; - case 1: // avg - break; - } - - switch (outputDesc[0].type) { - case nvinfer1::DataType::kFLOAT: - TRTRoIAlignForwardCUDAKernelLauncher( - (const float *)feat, (const float *)rois, (float *)output, (float *)argmax_y, - (float *)argmax_x, output_size, channels, height, width, mOutHeight, mOutWidth, - mSpatialScale, mSampleRatio, mPoolMode, mAligned, stream); - break; - - default: - break; - } - - return 0; -} - -nvinfer1::DataType TRTRoIAlign::getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *TRTRoIAlign::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTRoIAlign::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -int TRTRoIAlign::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t TRTRoIAlign::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mOutWidth) + serialized_size(mOutHeight) + serialized_size(mSpatialScale) + - 
serialized_size(mSampleRatio) + serialized_size(mPoolMode) + serialized_size(mAligned); -} - -void TRTRoIAlign::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mOutWidth); - serialize_value(&buffer, mOutHeight); - serialize_value(&buffer, mSpatialScale); - serialize_value(&buffer, mSampleRatio); - serialize_value(&buffer, mPoolMode); - serialize_value(&buffer, mAligned); -} - -TRTRoIAlignCreator::TRTRoIAlignCreator() { - mPluginAttributes.emplace_back(nvinfer1::PluginField("output_height")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("output_width")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("spatial_scale")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("sampling_ratio")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("mode")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("aligned")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *TRTRoIAlignCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTRoIAlignCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -nvinfer1::IPluginV2 *TRTRoIAlignCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - int outWidth = 7; - int outHeight = 7; - float spatialScale = 1.0; - int sampleRatio = 0; - int poolMode = -1; - bool aligned = true; - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; - } - std::string field_name(fc->fields[i].name); - - if (field_name.compare("output_height") == 0) { - outHeight = static_cast(fc->fields[i].data)[0]; - } - - if (field_name.compare("output_width") == 0) { - outWidth = static_cast(fc->fields[i].data)[0]; - } - - if (field_name.compare("spatial_scale") == 0) { - spatialScale = static_cast(fc->fields[i].data)[0]; - } - - if (field_name.compare("sampling_ratio") == 0) { - sampleRatio = static_cast(fc->fields[i].data)[0]; - } - - if (field_name.compare("mode") == 0) { - int data_size = fc->fields[i].length; - ASSERT(data_size > 0); - const char *data_start = static_cast(fc->fields[i].data); - std::string pool_mode(data_start); - if (pool_mode == "avg") { - poolMode = 1; - } else if (pool_mode == "max") { - poolMode = 0; - } else { - std::cout << "Unknown pool mode \"" << pool_mode << "\"." 
<< std::endl; - } - ASSERT(poolMode >= 0); - } - - if (field_name.compare("aligned") == 0) { - int aligned_int = static_cast(fc->fields[i].data)[0]; - aligned = aligned_int != 0; - } - } - - ASSERT(outHeight > 0); - ASSERT(outWidth > 0); - ASSERT(spatialScale > 0.); - ASSERT(poolMode >= 0); - - TRTRoIAlign *plugin = - new TRTRoIAlign(name, outWidth, outHeight, spatialScale, sampleRatio, poolMode, aligned); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *TRTRoIAlignCreator::deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT { - auto plugin = new TRTRoIAlign(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} -REGISTER_TENSORRT_PLUGIN(TRTRoIAlignCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"MMCVRoiAlign"}; + } // namespace + + TRTRoIAlign::TRTRoIAlign(const std::string& name, int outWidth, int outHeight, float spatialScale, int sampleRatio, int poolMode, bool aligned) + : TRTPluginBase(name) + , mOutWidth(outWidth) + , mOutHeight(outHeight) + , mSpatialScale(spatialScale) + , mSampleRatio(sampleRatio) + , mPoolMode(poolMode) + , mAligned(aligned) + { + } + + TRTRoIAlign::TRTRoIAlign(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mOutWidth); + deserialize_value(&data, &length, &mOutHeight); + deserialize_value(&data, &length, &mSpatialScale); + deserialize_value(&data, &length, &mSampleRatio); + deserialize_value(&data, &length, &mPoolMode); + deserialize_value(&data, &length, &mAligned); + } + + nvinfer1::IPluginV2DynamicExt* TRTRoIAlign::clone() const TRT_NOEXCEPT + { + TRTRoIAlign* plugin = new TRTRoIAlign(mLayerName, mOutWidth, mOutHeight, mSpatialScale, mSampleRatio, mPoolMode, mAligned); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs TRTRoIAlign::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + nvinfer1::DimsExprs ret; + ret.nbDims = 4; + ret.d[0] = inputs[1].d[0]; + ret.d[1] = inputs[0].d[1]; + ret.d[2] = exprBuilder.constant(mOutHeight); + ret.d[3] = exprBuilder.constant(mOutWidth); + + return ret; + } + + bool TRTRoIAlign::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT + { + return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + } + + void TRTRoIAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) TRT_NOEXCEPT {} + + size_t TRTRoIAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT + { + size_t output_size = 0; + size_t word_size = 0; + switch (mPoolMode) + { + case 0: // max + output_size = + outputs[0].dims.d[0] * outputs[0].dims.d[1] * outputs[0].dims.d[2] * outputs[0].dims.d[3]; + word_size = mmdeploy::getElementSize(outputs[0].type); + return output_size * word_size * 2; + break; + case 1: + return 0; + break; + default: + return 0; + } + return 0; + } + + int TRTRoIAlign::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + 
void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int channels = inputDesc[0].dims.d[1]; + int height = inputDesc[0].dims.d[2]; + int width = inputDesc[0].dims.d[3]; + + int output_size = outputDesc[0].dims.d[0] * outputDesc[0].dims.d[1] * outputDesc[0].dims.d[2] * + outputDesc[0].dims.d[3]; + int word_size = mmdeploy::getElementSize(outputDesc[0].type); + + const void* feat = inputs[0]; + const void* rois = inputs[1]; + void* output = outputs[0]; + void* argmax_y = nullptr; + void* argmax_x = nullptr; + + switch (mPoolMode) + { + case 0: // max + argmax_y = workSpace; + argmax_x = (char*)argmax_y + output_size * word_size; + break; + case 1: // avg + break; + } + + switch (outputDesc[0].type) + { + case nvinfer1::DataType::kFLOAT: + TRTRoIAlignForwardCUDAKernelLauncher( + (const float*)feat, + (const float*)rois, + (float*)output, + (float*)argmax_y, + (float*)argmax_x, + output_size, + channels, + height, + width, + mOutHeight, + mOutWidth, + mSpatialScale, + mSampleRatio, + mPoolMode, + mAligned, + stream); + break; + + default: + break; + } + + return 0; + } + + nvinfer1::DataType TRTRoIAlign::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* TRTRoIAlign::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTRoIAlign::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int TRTRoIAlign::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t TRTRoIAlign::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mOutWidth) + serialized_size(mOutHeight) + serialized_size(mSpatialScale) + + serialized_size(mSampleRatio) + serialized_size(mPoolMode) + serialized_size(mAligned); + } + + void TRTRoIAlign::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mOutWidth); + serialize_value(&buffer, mOutHeight); + serialize_value(&buffer, mSpatialScale); + serialize_value(&buffer, mSampleRatio); + serialize_value(&buffer, mPoolMode); + serialize_value(&buffer, mAligned); + } + + TRTRoIAlignCreator::TRTRoIAlignCreator() + { + mPluginAttributes.emplace_back(nvinfer1::PluginField("output_height")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("output_width")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("spatial_scale")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("sampling_ratio")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("mode")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("aligned")); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* TRTRoIAlignCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTRoIAlignCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* TRTRoIAlignCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + int outWidth = 7; + int outHeight = 7; + float spatialScale = 1.0; + int sampleRatio = 0; + int poolMode = -1; + bool aligned = true; + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("output_height") == 0) + { + outHeight = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("output_width") == 0) + { + outWidth = 
static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("spatial_scale") == 0) + { + spatialScale = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("sampling_ratio") == 0) + { + sampleRatio = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("mode") == 0) + { + int data_size = fc->fields[i].length; + ASSERT(data_size > 0); + const char* data_start = static_cast(fc->fields[i].data); + std::string pool_mode(data_start); + if (pool_mode == "avg") + { + poolMode = 1; + } + else if (pool_mode == "max") + { + poolMode = 0; + } + else + { + std::cout << "Unknown pool mode \"" << pool_mode << "\"." << std::endl; + } + ASSERT(poolMode >= 0); + } + + if (field_name.compare("aligned") == 0) + { + int aligned_int = static_cast(fc->fields[i].data)[0]; + aligned = aligned_int != 0; + } + } + + ASSERT(outHeight > 0); + ASSERT(outWidth > 0); + ASSERT(spatialScale > 0.); + ASSERT(poolMode >= 0); + + TRTRoIAlign* plugin = + new TRTRoIAlign(name, outWidth, outHeight, spatialScale, sampleRatio, poolMode, aligned); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* TRTRoIAlignCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new TRTRoIAlign(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + REGISTER_TENSORRT_PLUGIN(TRTRoIAlignCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.hpp b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.hpp index cfc14758f7..45301e014e 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.hpp @@ -8,65 +8,62 @@ #include #include "trt_plugin_base.hpp" -namespace mmdeploy { -class TRTRoIAlign : public TRTPluginBase { - public: - TRTRoIAlign(const std::string &name, int outWidth, int outHeight, float spatialScale, - int sampleRatio, int poolMode, bool aligned); +namespace mmdeploy +{ + class TRTRoIAlign : public TRTPluginBase + { + public: + TRTRoIAlign(const std::string& name, int outWidth, int outHeight, float spatialScale, int sampleRatio, int poolMode, bool aligned); - TRTRoIAlign(const std::string name, const void *data, size_t length); + TRTRoIAlign(const std::string name, const void* data, size_t length); - TRTRoIAlign() = delete; + TRTRoIAlign() = delete; - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT 
override; +        nvinfer1::DimsExprs            getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) +            TRT_NOEXCEPT override; +        bool   supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override; +        void   configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override; +        size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override; +        int    enqueue(const nvinfer1::PluginTensorDesc* inputDesc, +                       const nvinfer1::PluginTensorDesc* outputDesc, +                       const void* const*                inputs, +                       void* const*                      outputs, +                       void*                             workspace, +                       cudaStream_t                      stream) TRT_NOEXCEPT override; -  // IPluginV2Ext Methods -  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, -                                       int nbInputs) const TRT_NOEXCEPT override; +        // IPluginV2Ext Methods +        nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; -  // IPluginV2 Methods -  const char *getPluginType() const TRT_NOEXCEPT override; -  const char *getPluginVersion() const TRT_NOEXCEPT override; -  int getNbOutputs() const TRT_NOEXCEPT override; -  size_t getSerializationSize() const TRT_NOEXCEPT override; -  void serialize(void *buffer) const TRT_NOEXCEPT override; +        // IPluginV2 Methods +        const char* getPluginType() const TRT_NOEXCEPT override; +        const char* getPluginVersion() const TRT_NOEXCEPT override; +        int         getNbOutputs() const TRT_NOEXCEPT override; +        size_t      getSerializationSize() const TRT_NOEXCEPT override; +        void        serialize(void* buffer) const TRT_NOEXCEPT override; - private: -  int mOutWidth; -  int mOutHeight; -  float mSpatialScale; -  int mSampleRatio; -  int mPoolMode;  // 1:avg 0:max -  bool mAligned; -}; +      private: +        int   mOutWidth; +        int   mOutHeight; +        float mSpatialScale; +        int   mSampleRatio; +        int   mPoolMode;  // 1:avg 0:max +        bool  mAligned; +    }; -class TRTRoIAlignCreator : public TRTPluginCreatorBase { - public: -  TRTRoIAlignCreator(); +    class TRTRoIAlignCreator : public TRTPluginCreatorBase +    { +      public: +        TRTRoIAlignCreator(); -  const char *getPluginName() const TRT_NOEXCEPT override; +        const char* getPluginName() const TRT_NOEXCEPT override; -  const char *getPluginVersion() const TRT_NOEXCEPT override; -  nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) -      TRT_NOEXCEPT override; +        const char*          getPluginVersion() const TRT_NOEXCEPT override; +        nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) +            TRT_NOEXCEPT override; -  nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, -                                         size_t serialLength) TRT_NOEXCEPT override; -}; +        nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; +    }; }  // namespace mmdeploy #endif  // TRT_ROI_ALIGN_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu index 4e1a825d4f..4cd448aa52 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu @@ -4,104 +4,135 @@ #include "trt_roi_align_kernel.hpp" /*** Forward ***/ -template <typename T> -__global__ void
roi_align_forward_cuda_kernel(const int nthreads, const T* input, const T* rois, -                                              T* output, T* argmax_y, T* argmax_x, -                                              const int pooled_height, const int pooled_width, -                                              const T spatial_scale, const int sampling_ratio, -                                              const int pool_mode,  // 0 - max pool, 1 - avg pool -                                              const bool aligned, const int channels, -                                              const int height, const int width) { -  CUDA_1D_KERNEL_LOOP(index, nthreads) { -    // (n, c, ph, pw) is an element in the pooled output -    int pw = index % pooled_width; -    int ph = (index / pooled_width) % pooled_height; -    int c = (index / pooled_width / pooled_height) % channels; -    int n = index / pooled_width / pooled_height / channels; +template<typename T> +__global__ void roi_align_forward_cuda_kernel(const int nthreads, const T* input, const T* rois, T* output, T* argmax_y, T* argmax_x, const int pooled_height, const int pooled_width, const T spatial_scale, const int sampling_ratio, +                                              const int pool_mode,  // 0 - max pool, 1 - avg pool +                                              const bool aligned, +                                              const int  channels, +                                              const int  height, +                                              const int  width) +{ +    CUDA_1D_KERNEL_LOOP(index, nthreads) +    { +        // (n, c, ph, pw) is an element in the pooled output +        int pw = index % pooled_width; +        int ph = (index / pooled_width) % pooled_height; +        int c  = (index / pooled_width / pooled_height) % channels; +        int n  = index / pooled_width / pooled_height / channels; -    const T* offset_rois = rois + n * 5; -    int roi_batch_ind = offset_rois[0]; +        const T* offset_rois   = rois + n * 5; +        int      roi_batch_ind = offset_rois[0]; -    // Do not using rounding; this implementation detail is critical -    T offset = aligned ? (T)0.5 : (T)0.0; -    T roi_start_w = offset_rois[1] * spatial_scale - offset; -    T roi_start_h = offset_rois[2] * spatial_scale - offset; -    T roi_end_w = offset_rois[3] * spatial_scale - offset; -    T roi_end_h = offset_rois[4] * spatial_scale - offset; +        // Do not use rounding; this implementation detail is critical +        T offset      = aligned ? (T)0.5 : (T)0.0; +        T roi_start_w = offset_rois[1] * spatial_scale - offset; +        T roi_start_h = offset_rois[2] * spatial_scale - offset; +        T roi_end_w   = offset_rois[3] * spatial_scale - offset; +        T roi_end_h   = offset_rois[4] * spatial_scale - offset; -    T roi_width = roi_end_w - roi_start_w; -    T roi_height = roi_end_h - roi_start_h; -    if (!aligned) {  // for backward-compatibility only -      roi_width = max(roi_width, (T)1.); -      roi_height = max(roi_height, (T)1.); -    } +        T roi_width  = roi_end_w - roi_start_w; +        T roi_height = roi_end_h - roi_start_h; +        if (!aligned) +        {  // for backward-compatibility only +            roi_width  = max(roi_width, (T)1.); +            roi_height = max(roi_height, (T)1.); +        } -    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); -    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); +        T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); +        T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); -    const T* offset_input = input + (roi_batch_ind * channels + c) * height * width; +        const T* offset_input = input + (roi_batch_ind * channels + c) * height * width; -    // We use roi_bin_grid to sample the grid and mimic integral -    int roi_bin_grid_h = -        (sampling_ratio > 0) ? sampling_ratio : static_cast<int>(ceilf(roi_height / pooled_height)); -    int roi_bin_grid_w = -        (sampling_ratio > 0) ? sampling_ratio : static_cast<int>(ceilf(roi_width / pooled_width)); +        // We use roi_bin_grid to sample the grid and mimic integral +        int roi_bin_grid_h = +            (sampling_ratio > 0) ? 
sampling_ratio : static_cast<int>(ceilf(roi_height / pooled_height)); +        int roi_bin_grid_w = +            (sampling_ratio > 0) ? sampling_ratio : static_cast<int>(ceilf(roi_width / pooled_width)); -    if (pool_mode == 0) { -      // We do max pooling inside a bin -      T maxval = -FLT_MAX; -      T maxidx_y = -1.f, maxidx_x = -1.f; -      for (int iy = 0; iy < roi_bin_grid_h; iy++) { -        const T y = roi_start_h + ph * bin_size_h + -                    static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); -        for (int ix = 0; ix < roi_bin_grid_w; ix++) { -          const T x = roi_start_w + pw * bin_size_w + -                      static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w); -          T val = bilinear_interpolate(offset_input, height, width, y, x); -          if (val > maxval) { -            maxval = val; -            maxidx_y = y; -            maxidx_x = x; -          } +        if (pool_mode == 0) +        { +            // We do max pooling inside a bin +            T maxval   = -FLT_MAX; +            T maxidx_y = -1.f, maxidx_x = -1.f; +            for (int iy = 0; iy < roi_bin_grid_h; iy++) +            { +                const T y = roi_start_h + ph * bin_size_h + +                            static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); +                for (int ix = 0; ix < roi_bin_grid_w; ix++) +                { +                    const T x = roi_start_w + pw * bin_size_w + +                                static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w); +                    T val = bilinear_interpolate(offset_input, height, width, y, x); +                    if (val > maxval) +                    { +                        maxval   = val; +                        maxidx_y = y; +                        maxidx_x = x; +                    } +                } +            } +            output[index]   = maxval; +            argmax_y[index] = maxidx_y; +            argmax_x[index] = maxidx_x; }
-      } -      output[index] = maxval; -      argmax_y[index] = maxidx_y; -      argmax_x[index] = maxidx_x; -    } else if (pool_mode == 1) { -      // We do average pooling inside a bin -      const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); -      T output_val = 0.; -      for (int iy = 0; iy < roi_bin_grid_h; iy++) { -        const T y = roi_start_h + ph * bin_size_h + -                    static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); -        for (int ix = 0; ix < roi_bin_grid_w; ix++) { -          const T x = roi_start_w + pw * bin_size_w + -                      static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w); -          T val = bilinear_interpolate(offset_input, height, width, y, x); -          output_val += val; +        else if (pool_mode == 1) +        { +            // We do average pooling inside a bin +            const T count      = max(roi_bin_grid_h * roi_bin_grid_w, 1); +            T       output_val = 0.; +            for (int iy = 0; iy < roi_bin_grid_h; iy++) +            { +                const T y = roi_start_h + ph * bin_size_h + +                            static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); +                for (int ix = 0; ix < roi_bin_grid_w; ix++) +                { +                    const T x = roi_start_w + pw * bin_size_w + +                                static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w); +                    T val = bilinear_interpolate(offset_input, height, width, y, x); +                    output_val += val; +                } +            } +            output[index] = output_val / count; }
-        } -      } -      output[index] = output_val / count; }
-  } } -template <typename scalar_t> -void TRTRoIAlignForwardCUDAKernelLauncher(const scalar_t* input, const scalar_t* rois, -                                          scalar_t* output, scalar_t* argmax_y, scalar_t* argmax_x, -                                          int output_size, int channels, int height, int width, -                                          int aligned_height, int aligned_width, -                                          scalar_t spatial_scale, int sampling_ratio, int pool_mode, -                                          bool aligned, cudaStream_t stream) { -  roi_align_forward_cuda_kernel -      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( -          output_size, input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, -          static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode, aligned, channels, -          height, width);
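+// One CUDA thread per output element: GET_BLOCKS(output_size) blocks of THREADS_PER_BLOCK threads, launched on the caller's stream.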
+template<typename scalar_t> +void TRTRoIAlignForwardCUDAKernelLauncher(const scalar_t* input, const scalar_t* rois, scalar_t* output, scalar_t* argmax_y, scalar_t* argmax_x, int output_size, int channels, int height, int width, int aligned_height, int aligned_width, scalar_t spatial_scale, int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream) +{ +    roi_align_forward_cuda_kernel +        <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( +            output_size, +            input, +            rois, +            output, +            argmax_y, +            argmax_x, +            aligned_height, +            aligned_width, +            static_cast<scalar_t>(spatial_scale), +            sampling_ratio, +            pool_mode, +            aligned, +            channels, +            height, +            width); }  template void TRTRoIAlignForwardCUDAKernelLauncher( -    const float* input, const float* rois, float* output, float* argmax_y, float* argmax_x, -    int output_size, int channels, int height, int width, int aligned_height, int aligned_width, -    float spatial_scale, int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream); +    const float* input, +    const float* rois, +    float*       output, +    float*       argmax_y, +    float*       argmax_x, +    int          output_size, +    int          channels, +    int          height, +    int          width, +    int          aligned_height, +    int          aligned_width, +    float        spatial_scale, +    int          sampling_ratio, +    int          pool_mode, +    bool         aligned, +    cudaStream_t stream); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp index 3db656bff9..39e8dc7893 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp @@ -4,12 +4,7 @@ #include "common_cuda_helper.hpp" -template <typename scalar_t> -void TRTRoIAlignForwardCUDAKernelLauncher(const scalar_t* input, const scalar_t* rois, -                                          scalar_t* output, scalar_t* argmax_y, scalar_t* argmax_x, -                                          int output_size, int channels, int height, int width, -                                          int aligned_height, int aligned_width, -                                          scalar_t spatial_scale, int sampling_ratio, int pool_mode, -                                          bool aligned, cudaStream_t stream); +template<typename scalar_t> +void TRTRoIAlignForwardCUDAKernelLauncher(const scalar_t* input, const scalar_t* rois, scalar_t* output, scalar_t* argmax_y, scalar_t* argmax_x, int output_size, int channels, int height, int width, int aligned_height, int aligned_width, scalar_t spatial_scale, int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream); #endif  // ROI_ALIGN_CUDA_KERNEL_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.cpp b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.cpp index a4ecb2356a..b20a4b37ea 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.cpp @@ -10,174 +10,223 @@ using namespace nvinfer1; -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"ScaledDotProductAttentionTRT"}; -}  // namespace - -ScaledDotProductAttentionTRT::ScaledDotProductAttentionTRT(const std::string &name) -    : TRTPluginBase(name), mask_dim(0) {} - -ScaledDotProductAttentionTRT::ScaledDotProductAttentionTRT(const std::string name, const void *data, -                                                           size_t length) -    : TRTPluginBase(name), mask_dim(0) {} - -ScaledDotProductAttentionTRT::~ScaledDotProductAttentionTRT() {} - -nvinfer1::IPluginV2DynamicExt *ScaledDotProductAttentionTRT::clone() const TRT_NOEXCEPT { -  ScaledDotProductAttentionTRT *plugin = new ScaledDotProductAttentionTRT(mLayerName); -  plugin->setPluginNamespace(getPluginNamespace()); -  return plugin; -} - -nvinfer1::DimsExprs ScaledDotProductAttentionTRT::getOutputDimensions( -    int outputIndex, const nvinfer1::DimsExprs *inputs, int 
nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - if (outputIndex == 0) return inputs[0]; - nvinfer1::DimsExprs ret; - ret.nbDims = 3; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = inputs[1].d[1]; - - return ret; -} - -bool ScaledDotProductAttentionTRT::supportsFormatCombination( - int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 0) { - return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - } else { - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - } -} - -// Attach the plugin object to an execution context and grant the plugin the -// access to some context resource. -void ScaledDotProductAttentionTRT::attachToContext(cudnnContext *cudnnContext, - cublasContext *cublasContext, - IGpuAllocator *gpuAllocator) TRT_NOEXCEPT { - _cublas_handle = cublasContext; - _cudnn_handle = cudnnContext; - cudnnCreateTensorDescriptor(&_x_desc); - cudnnCreateTensorDescriptor(&_y_desc); - cudnnCreateTensorDescriptor(&_mask_desc); -} - -// Detach the plugin object from its execution context. -void ScaledDotProductAttentionTRT::detachFromContext() TRT_NOEXCEPT { - cudnnDestroyTensorDescriptor(_y_desc); - cudnnDestroyTensorDescriptor(_x_desc); - cudnnDestroyTensorDescriptor(_mask_desc); -} - -void ScaledDotProductAttentionTRT::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT { - if (nbInputs != 4) { - mask_dim = 0; - } else { - mask_dim = in[3].desc.dims.nbDims; - } -} - -int ScaledDotProductAttentionTRT::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, void *const *outputs, - void *workSpace, cudaStream_t stream) TRT_NOEXCEPT { - if (CUDNN_STATUS_SUCCESS != cudnnSetStream(_cudnn_handle, stream)) return 1; - if (CUBLAS_STATUS_SUCCESS != cublasSetStream(_cublas_handle, stream)) return 1; - int B = inputDesc[0].dims.d[0]; // batch * heads - int Nt = inputDesc[0].dims.d[1]; - int Ns = inputDesc[1].dims.d[1]; - int E = inputDesc[0].dims.d[2]; // embeding size - - const void *query = inputs[0]; - const void *key = inputs[1]; - const void *value = inputs[2]; - const void *mask = nullptr; - - int mask_dims[3]; - mask_dims[0] = 0; - if (mask_dim > 0) { - mask = inputs[3]; - // check if mask need broadcast - if (mask_dim == 2) { - mask_dims[0] = 1; - mask_dims[1] = inputDesc[3].dims.d[0]; - mask_dims[2] = inputDesc[3].dims.d[1]; - } else { - mask_dims[0] = inputDesc[3].dims.d[0]; - mask_dims[1] = inputDesc[3].dims.d[1]; - mask_dims[2] = inputDesc[3].dims.d[2]; - } - } - - void *output = outputs[0]; - void *attn = outputs[1]; - - auto data_type = inputDesc[0].type; - cudnnDataType_t cudnn_dtype{}; - convert_trt2cudnn_dtype(data_type, &cudnn_dtype); - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - dot_product_attention_impl((float *)query, (float *)key, (float *)value, (float *)mask, - (float *)attn, (float *)output, B, Nt, Ns, E, &mask_dims[0], - _x_desc, _y_desc, _mask_desc, cudnn_dtype, stream, - _cublas_handle, _cudnn_handle); - break; - default: - return 1; - } - - return 0; -} - -nvinfer1::DataType ScaledDotProductAttentionTRT::getOutputDataType( - int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char 
*ScaledDotProductAttentionTRT::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *ScaledDotProductAttentionTRT::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -int ScaledDotProductAttentionTRT::getNbOutputs() const TRT_NOEXCEPT { return 2; } - -size_t ScaledDotProductAttentionTRT::getSerializationSize() const TRT_NOEXCEPT { return 0; } - -void ScaledDotProductAttentionTRT::serialize(void *buffer) const TRT_NOEXCEPT {} - -////////////////////// creator ///////////////////////////// - -ScaledDotProductAttentionTRTCreator::ScaledDotProductAttentionTRTCreator() {} - -const char *ScaledDotProductAttentionTRTCreator::getPluginName() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char *ScaledDotProductAttentionTRTCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -nvinfer1::IPluginV2 *ScaledDotProductAttentionTRTCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - ScaledDotProductAttentionTRT *plugin = new ScaledDotProductAttentionTRT(name); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *ScaledDotProductAttentionTRTCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT { - auto plugin = new ScaledDotProductAttentionTRT(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} -REGISTER_TENSORRT_PLUGIN(ScaledDotProductAttentionTRTCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"ScaledDotProductAttentionTRT"}; + } // namespace + + ScaledDotProductAttentionTRT::ScaledDotProductAttentionTRT(const std::string& name) + : TRTPluginBase(name) + , mask_dim(0) + { + } + + ScaledDotProductAttentionTRT::ScaledDotProductAttentionTRT(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + , mask_dim(0) + { + } + + ScaledDotProductAttentionTRT::~ScaledDotProductAttentionTRT() {} + + nvinfer1::IPluginV2DynamicExt* ScaledDotProductAttentionTRT::clone() const TRT_NOEXCEPT + { + ScaledDotProductAttentionTRT* plugin = new ScaledDotProductAttentionTRT(mLayerName); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::DimsExprs ScaledDotProductAttentionTRT::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + if (outputIndex == 0) return inputs[0]; + nvinfer1::DimsExprs ret; + ret.nbDims = 3; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[0].d[1]; + ret.d[2] = inputs[1].d[1]; + + return ret; + } + + bool ScaledDotProductAttentionTRT::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (pos == 0) + { + return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + } + else + { + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + } + } + + // Attach the plugin object to an execution context and grant the plugin the + // access to some context resource. 
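+    // Note: TensorRT owns the cuDNN/cuBLAS handles passed in here; the plugin only
+    // creates its tensor descriptors, which detachFromContext() destroys again.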
+    void ScaledDotProductAttentionTRT::attachToContext(cudnnContext*  cudnnContext, +                                                       cublasContext* cublasContext, +                                                       IGpuAllocator* gpuAllocator) TRT_NOEXCEPT +    { +        _cublas_handle = cublasContext; +        _cudnn_handle  = cudnnContext; +        cudnnCreateTensorDescriptor(&_x_desc); +        cudnnCreateTensorDescriptor(&_y_desc); +        cudnnCreateTensorDescriptor(&_mask_desc); +    } + +    // Detach the plugin object from its execution context. +    void ScaledDotProductAttentionTRT::detachFromContext() TRT_NOEXCEPT +    { +        cudnnDestroyTensorDescriptor(_y_desc); +        cudnnDestroyTensorDescriptor(_x_desc); +        cudnnDestroyTensorDescriptor(_mask_desc); +    } + +    void ScaledDotProductAttentionTRT::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, +                                                       int                                      nbInputs, +                                                       const nvinfer1::DynamicPluginTensorDesc* out, +                                                       int                                      nbOutputs) TRT_NOEXCEPT +    { +        if (nbInputs != 4) +        { +            mask_dim = 0; +        } +        else +        { +            mask_dim = in[3].desc.dims.nbDims; +        } +    } + +    int ScaledDotProductAttentionTRT::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, +                                              const nvinfer1::PluginTensorDesc* outputDesc, +                                              const void* const*                inputs, +                                              void* const*                      outputs, +                                              void*                             workSpace, +                                              cudaStream_t                      stream) TRT_NOEXCEPT +    { +        if (CUDNN_STATUS_SUCCESS != cudnnSetStream(_cudnn_handle, stream)) return 1; +        if (CUBLAS_STATUS_SUCCESS != cublasSetStream(_cublas_handle, stream)) return 1; +        int B  = inputDesc[0].dims.d[0];  // batch * heads +        int Nt = inputDesc[0].dims.d[1]; +        int Ns = inputDesc[1].dims.d[1]; +        int E  = inputDesc[0].dims.d[2];  // embedding size + +        const void* query = inputs[0]; +        const void* key   = inputs[1]; +        const void* value = inputs[2]; +        const void* mask  = nullptr; + +        int mask_dims[3]; +        mask_dims[0] = 0; +        if (mask_dim > 0) +        { +            mask = inputs[3]; +            // check if mask needs broadcast +            if (mask_dim == 2) +            { +                mask_dims[0] = 1; +                mask_dims[1] = inputDesc[3].dims.d[0]; +                mask_dims[2] = inputDesc[3].dims.d[1]; +            } +            else +            { +                mask_dims[0] = inputDesc[3].dims.d[0]; +                mask_dims[1] = inputDesc[3].dims.d[1]; +                mask_dims[2] = inputDesc[3].dims.d[2]; +            } +        } + +        void* output = outputs[0]; +        void* attn   = outputs[1]; + +        auto            data_type = inputDesc[0].type; +        cudnnDataType_t cudnn_dtype{}; +        convert_trt2cudnn_dtype(data_type, &cudnn_dtype); +        switch (data_type) +        { +            case nvinfer1::DataType::kFLOAT: +                dot_product_attention_impl((float*)query, (float*)key, (float*)value, (float*)mask, (float*)attn, (float*)output, B, Nt, Ns, E, &mask_dims[0], _x_desc, _y_desc, _mask_desc, cudnn_dtype, stream, _cublas_handle, _cudnn_handle); +                break; +            default: +                return 1; +        } + +        return 0; +    } + +    nvinfer1::DataType ScaledDotProductAttentionTRT::getOutputDataType( +        int                       index, +        const nvinfer1::DataType* inputTypes, +        int                       nbInputs) const TRT_NOEXCEPT +    { +        return inputTypes[0]; +    } + +    // IPluginV2 Methods +    const char* ScaledDotProductAttentionTRT::getPluginType() const TRT_NOEXCEPT +    { +        return PLUGIN_NAME; +    } + +    const char* ScaledDotProductAttentionTRT::getPluginVersion() const TRT_NOEXCEPT +    { +        return PLUGIN_VERSION; +    } + +    int ScaledDotProductAttentionTRT::getNbOutputs() const TRT_NOEXCEPT +    { +        return 2; +    } + +    size_t ScaledDotProductAttentionTRT::getSerializationSize() const TRT_NOEXCEPT +    { +        return 0; +    } + +    void ScaledDotProductAttentionTRT::serialize(void* buffer) const TRT_NOEXCEPT {} + +    ////////////////////// creator ///////////////////////////// + +    ScaledDotProductAttentionTRTCreator::ScaledDotProductAttentionTRTCreator() {} + +    const char* ScaledDotProductAttentionTRTCreator::getPluginName() const TRT_NOEXCEPT +    { +        return PLUGIN_NAME; +    } + +    const char* 
ScaledDotProductAttentionTRTCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* ScaledDotProductAttentionTRTCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + ScaledDotProductAttentionTRT* plugin = new ScaledDotProductAttentionTRT(name); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* ScaledDotProductAttentionTRTCreator::deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new ScaledDotProductAttentionTRT(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + REGISTER_TENSORRT_PLUGIN(ScaledDotProductAttentionTRTCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.hpp b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.hpp index 86d35616a9..9e184626cb 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.hpp @@ -9,65 +9,64 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class ScaledDotProductAttentionTRT : public TRTPluginBase { - public: - ScaledDotProductAttentionTRT(const std::string &name); +namespace mmdeploy +{ + class ScaledDotProductAttentionTRT : public TRTPluginBase + { + public: + ScaledDotProductAttentionTRT(const std::string& name); - ScaledDotProductAttentionTRT(const std::string name, const void *data, size_t length); + ScaledDotProductAttentionTRT(const std::string name, const void* data, size_t length); - ScaledDotProductAttentionTRT() = delete; + ScaledDotProductAttentionTRT() = delete; - ~ScaledDotProductAttentionTRT() TRT_NOEXCEPT override; + ~ScaledDotProductAttentionTRT() TRT_NOEXCEPT override; - virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; + virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override; + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) + TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* 
const* outputs, +                    void*                             workspace, +                    cudaStream_t                      stream) TRT_NOEXCEPT override; -  // IPluginV2Ext Methods -  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, -                                       int nbInputs) const TRT_NOEXCEPT override; +        // IPluginV2Ext Methods +        nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; -  // IPluginV2 Methods -  const char *getPluginType() const TRT_NOEXCEPT override; -  const char *getPluginVersion() const TRT_NOEXCEPT override; -  int getNbOutputs() const TRT_NOEXCEPT override; -  size_t getSerializationSize() const TRT_NOEXCEPT override; -  void serialize(void *buffer) const TRT_NOEXCEPT override; -  void attachToContext(cudnnContext *cudnn, cublasContext *cublas, -                       nvinfer1::IGpuAllocator *allocator) TRT_NOEXCEPT override; -  void detachFromContext() TRT_NOEXCEPT override; +        // IPluginV2 Methods +        const char* getPluginType() const TRT_NOEXCEPT override; +        const char* getPluginVersion() const TRT_NOEXCEPT override; +        int         getNbOutputs() const TRT_NOEXCEPT override; +        size_t      getSerializationSize() const TRT_NOEXCEPT override; +        void        serialize(void* buffer) const TRT_NOEXCEPT override; +        void        attachToContext(cudnnContext* cudnn, cublasContext* cublas, nvinfer1::IGpuAllocator* allocator) TRT_NOEXCEPT override; +        void        detachFromContext() TRT_NOEXCEPT override; - private: -  int mask_dim; -  cublasHandle_t _cublas_handle{}; -  cudnnHandle_t _cudnn_handle{}; -  cudnnTensorDescriptor_t _x_desc{}, _y_desc{}, _mask_desc{}; -}; +      private: +        int                     mask_dim; +        cublasHandle_t          _cublas_handle{}; +        cudnnHandle_t           _cudnn_handle{}; +        cudnnTensorDescriptor_t _x_desc{}, _y_desc{}, _mask_desc{}; +    }; -class ScaledDotProductAttentionTRTCreator : public TRTPluginCreatorBase { - public: -  ScaledDotProductAttentionTRTCreator(); +    class ScaledDotProductAttentionTRTCreator : public TRTPluginCreatorBase +    { +      public: +        ScaledDotProductAttentionTRTCreator(); -  const char *getPluginName() const TRT_NOEXCEPT override; +        const char* getPluginName() const TRT_NOEXCEPT override; -  const char *getPluginVersion() const TRT_NOEXCEPT override; +        const char* getPluginVersion() const TRT_NOEXCEPT override; -  nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) -      TRT_NOEXCEPT override; +        nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) +            TRT_NOEXCEPT override; -  nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, -                                         size_t serialLength) TRT_NOEXCEPT override; -}; +        nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; +    }; }  // namespace mmdeploy #endif  // TRT_SCALED_DOT_PRODUCT_ATTENTION_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.cu index a0ee16c998..738316b9a8 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.cu @@ -11,93 +11,79 @@ #include "scaled_dot_product_attention_kernel.hpp" #include "trt_plugin_helper.hpp" -template <typename scalar_t> -cublasStatus_t cublasgemmStridedBatchedWrap(cublasHandle_t handle, cublasOperation_t transa, -                                            cublasOperation_t transb, int m, int n, int k, -                                            const scalar_t* alpha, const scalar_t* 
A, int lda, -                                            long long int strideA, const scalar_t* B, int ldb, -                                            long long int strideB, const scalar_t* beta, -                                            scalar_t* C, int ldc, long long int strideC, -                                            int batchCount); +template<typename scalar_t> +cublasStatus_t cublasgemmStridedBatchedWrap(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const scalar_t* alpha, const scalar_t* A, int lda, long long int strideA, const scalar_t* B, int ldb, long long int strideB, const scalar_t* beta, scalar_t* C, int ldc, long long int strideC, int batchCount); -template <> -cublasStatus_t cublasgemmStridedBatchedWrap<float>(cublasHandle_t handle, cublasOperation_t transa, -                                            cublasOperation_t transb, int m, int n, int k, -                                            const float* alpha, const float* A, int lda, -                                            long long int strideA, const float* B, int ldb, -                                            long long int strideB, const float* beta, -                                            float* C, int ldc, long long int strideC, -                                            int batchCount) { -  return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, -                                   strideB, beta, C, ldc, strideC, batchCount); +template<> +cublasStatus_t cublasgemmStridedBatchedWrap<float>(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, long long int strideA, const float* B, int ldb, long long int strideB, const float* beta, float* C, int ldc, long long int strideC, int batchCount) +{ +    return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); } -template <> -cublasStatus_t cublasgemmStridedBatchedWrap<__half>(cublasHandle_t handle, cublasOperation_t transa, -                                                    cublasOperation_t transb, int m, int n, int k, -                                                    const __half* alpha, const __half* A, int lda, -                                                    long long int strideA, const __half* B, int ldb, -                                                    long long int strideB, const __half* beta, -                                                    __half* C, int ldc, long long int strideC, -                                                    int batchCount) { -  return cublasHgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, -                                   strideB, beta, C, ldc, strideC, batchCount); +template<> +cublasStatus_t cublasgemmStridedBatchedWrap<__half>(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* A, int lda, long long int strideA, const __half* B, int ldb, long long int strideB, const __half* beta, __half* C, int ldc, long long int strideC, int batchCount) +{ +    return cublasHgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); } -template <typename scalar_t> -void dot_product_attention_impl(const scalar_t* query, const scalar_t* key, const scalar_t* value, -                                const scalar_t* mask, scalar_t* attn, scalar_t* output, int B, -                                int Nt, int Ns, int E, const int* mask_dims, -                                cudnnTensorDescriptor_t& x_desc, cudnnTensorDescriptor_t& y_desc, -                                cudnnTensorDescriptor_t& mask_desc, cudnnDataType_t cudnn_dtype, -                                cudaStream_t stream, cublasHandle_t cublas_handle, -                                cudnnHandle_t cudnn_handle) { -  { -    // Q @ K -    const int m = Ns; -    const int n = Nt; -    const int k = E; -    const auto alpha = scalar_t(1.0f / sqrt(float(E))); -    const auto beta = scalar_t(0); -    cublasgemmStridedBatchedWrap(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, key, k, -                                 Ns * E, query, k, Nt * E, &beta, attn, m, Nt * Ns, B); -  }
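+// dot_product_attention_impl computes softmax(Q @ K^T / sqrt(E) + mask) @ V: the two matrix products run as cuBLAS strided-batched GEMMs, while the mask addition and the softmax go through cuDNN.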
+template<typename scalar_t> +void dot_product_attention_impl(const scalar_t* query, const scalar_t* key, const scalar_t* value, const scalar_t* mask, scalar_t* attn, scalar_t* output, int B, int Nt, int Ns, int E, const int* mask_dims, 
cudnnTensorDescriptor_t& x_desc, cudnnTensorDescriptor_t& y_desc, cudnnTensorDescriptor_t& mask_desc, cudnnDataType_t cudnn_dtype, cudaStream_t stream, cublasHandle_t cublas_handle, cudnnHandle_t cudnn_handle) +{ +    { +        // Q @ K +        const int  m     = Ns; +        const int  n     = Nt; +        const int  k     = E; +        const auto alpha = scalar_t(1.0f / sqrt(float(E))); +        const auto beta  = scalar_t(0); +        cublasgemmStridedBatchedWrap(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, key, k, Ns * E, query, k, Nt * E, &beta, attn, m, Nt * Ns, B); +    } -  if (mask_dims != nullptr && mask_dims[0] != 0) { -    const auto alpha = scalar_t(1); -    const auto beta = scalar_t(1); -    cudnnSetTensor4dDescriptor(mask_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, mask_dims[0], -                               mask_dims[1], mask_dims[2]); -    cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, B, Nt, Ns); -    cudnnAddTensor(cudnn_handle, &alpha, mask_desc, mask, &beta, x_desc, attn); -  } +    if (mask_dims != nullptr && mask_dims[0] != 0) +    { +        const auto alpha = scalar_t(1); +        const auto beta  = scalar_t(1); +        cudnnSetTensor4dDescriptor(mask_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, mask_dims[0], mask_dims[1], mask_dims[2]); +        cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, B, Nt, Ns); +        cudnnAddTensor(cudnn_handle, &alpha, mask_desc, mask, &beta, x_desc, attn); +    } -  { -    // softmax attention -    const auto alpha = scalar_t(1); -    const auto beta = scalar_t(0); -    cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, B * Nt, Ns, 1, 1); -    cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, B * Nt, Ns, 1, 1); -    cudnnSoftmaxForward(cudnn_handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, &alpha, -                        x_desc, attn, &beta, y_desc, attn); -  } +    { +        // softmax attention +        const auto alpha = scalar_t(1); +        const auto beta  = scalar_t(0); +        cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, B * Nt, Ns, 1, 1); +        cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, B * Nt, Ns, 1, 1); +        cudnnSoftmaxForward(cudnn_handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, &alpha, x_desc, attn, &beta, y_desc, attn); +    } -  { -    // attn @ v -    const int m = E; -    const int n = Nt; -    const int k = Ns; -    const auto alpha = scalar_t(1); -    const auto beta = scalar_t(0); -    cublasgemmStridedBatchedWrap(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, value, m, -                                 Ns * E, (const scalar_t*)(attn), k, Ns * Nt, &beta, output, m, -                                 Nt * E, B); -  } +    { +        // attn @ v +        const int  m     = E; +        const int  n     = Nt; +        const int  k     = Ns; +        const auto alpha = scalar_t(1); +        const auto beta  = scalar_t(0); +        cublasgemmStridedBatchedWrap(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, value, m, Ns * E, (const scalar_t*)(attn), k, Ns * Nt, &beta, output, m, Nt * E, B); +    } }  template void dot_product_attention_impl( -    const float* query, const float* key, const float* value, const float* mask, float* attn, -    float* output, int B, int Nt, int Ns, int E, const int* mask_dims, -    cudnnTensorDescriptor_t& x_desc, cudnnTensorDescriptor_t& y_desc, -    cudnnTensorDescriptor_t& mask_desc, cudnnDataType_t cudnn_dtype, cudaStream_t stream, -    cublasHandle_t cublas_handle, cudnnHandle_t cudnn_handle); +    const float*             query, +    const float*             key, +    const float*             value, +    const float*             mask, +    float*                   attn, +    float*                   output, +    int                      B, +    int                      Nt, +    int                      Ns, +    int                      E, +    const int*               mask_dims, +    cudnnTensorDescriptor_t& x_desc, +    cudnnTensorDescriptor_t& y_desc, +    cudnnTensorDescriptor_t& mask_desc, +    cudnnDataType_t          cudnn_dtype, +    
cudaStream_t             stream, +    cublasHandle_t           cublas_handle, +    cudnnHandle_t            cudnn_handle); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.hpp index d1cdc7773a..10db2aade1 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.hpp @@ -5,13 +5,7 @@ #include  #include  -template <typename scalar_t> -void dot_product_attention_impl(const scalar_t* query, const scalar_t* key, const scalar_t* value, -                                const scalar_t* mask, scalar_t* attn, scalar_t* output, int B, -                                int Nt, int Ns, int E, const int* mask_dims, -                                cudnnTensorDescriptor_t& x_desc, cudnnTensorDescriptor_t& y_desc, -                                cudnnTensorDescriptor_t& mask_desc, cudnnDataType_t cudnn_dtype, -                                cudaStream_t stream, cublasHandle_t cublas_handle, -                                cudnnHandle_t cudnn_handle); +template<typename scalar_t> +void dot_product_attention_impl(const scalar_t* query, const scalar_t* key, const scalar_t* value, const scalar_t* mask, scalar_t* attn, scalar_t* output, int B, int Nt, int Ns, int E, const int* mask_dims, cudnnTensorDescriptor_t& x_desc, cudnnTensorDescriptor_t& y_desc, cudnnTensorDescriptor_t& mask_desc, cudnnDataType_t cudnn_dtype, cudaStream_t stream, cublasHandle_t cublas_handle, cudnnHandle_t cudnn_handle); #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.cpp b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.cpp index 13c637f408..ca0ed9afa0 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.cpp @@ -2,155 +2,192 @@ #include "NvInferVersion.h" // ScatterND is supported since TensorRT8 #if NV_TENSORRT_MAJOR <= 7 -#include  -#include  - -#include  - -#include "trt_scatternd.hpp" -#include "trt_scatternd_kernel.hpp" -#include "trt_serialize.hpp" - -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"ScatterND"}; -}  // namespace - -TRTScatterND::TRTScatterND(const std::string &name) : TRTPluginBase(name) {} - -TRTScatterND::TRTScatterND(const std::string name, const void *data, size_t length) -    : TRTPluginBase(name) {} - -nvinfer1::IPluginV2DynamicExt *TRTScatterND::clone() const TRT_NOEXCEPT { -  TRTScatterND *plugin = new TRTScatterND(mLayerName); -  plugin->setPluginNamespace(getPluginNamespace()); - -  return plugin; -} - -nvinfer1::DimsExprs TRTScatterND::getOutputDimensions( -    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, -    nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { -  return inputs[0]; -} - -bool TRTScatterND::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, -                                             int nbInputs, int nbOutputs) TRT_NOEXCEPT { -  if (pos < nbInputs) { -    switch (pos) { -      case 0: -        // data -        return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && -                ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) || -               (ioDesc[pos].type == nvinfer1::DataType::kINT32 && -                ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); -      case 1: -        // indices -        return ioDesc[pos].type == nvinfer1::DataType::kINT32 && -               ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; -      case 2: -        // updates -        return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; -      default: -        return true; +    #include  +    #include  + +    #include  + +    #include 
"trt_scatternd.hpp" + #include "trt_scatternd_kernel.hpp" + #include "trt_serialize.hpp" + +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"ScatterND"}; + } // namespace + + TRTScatterND::TRTScatterND(const std::string& name) + : TRTPluginBase(name) + { + } + + TRTScatterND::TRTScatterND(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { } - } else { - switch (pos - nbInputs) { - case 0: - // output - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - default: + + nvinfer1::IPluginV2DynamicExt* TRTScatterND::clone() const TRT_NOEXCEPT + { + TRTScatterND* plugin = new TRTScatterND(mLayerName); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs TRTScatterND::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + return inputs[0]; + } + + bool TRTScatterND::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT + { + if (pos < nbInputs) + { + switch (pos) + { + case 0: + // data + return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) || + (ioDesc[pos].type == nvinfer1::DataType::kINT32 && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + case 1: + // indices + return ioDesc[pos].type == nvinfer1::DataType::kINT32 && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + case 2: + // updates + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + default: + return true; + } + } + else + { + switch (pos - nbInputs) + { + case 0: + // output + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + default: + return true; + } + } return true; } - } - return true; -} - -void TRTScatterND::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT {} - -size_t TRTScatterND::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -int TRTScatterND::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workSpace, cudaStream_t stream) TRT_NOEXCEPT { - const int *dims = &(inputDesc[0].dims.d[0]); - const int *indices_dims = &(inputDesc[1].dims.d[0]); - int nbDims = inputDesc[0].dims.nbDims; - int indice_nbDims = inputDesc[1].dims.nbDims; - - const void *data = inputs[0]; - const void *indices = inputs[1]; - const void *update = inputs[2]; - void *output = outputs[0]; - - auto data_type = inputDesc[0].type; - - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - TRTONNXScatterNDKernelLauncher((float *)data, (int *)indices, (float *)update, dims, - nbDims, indices_dims, indice_nbDims, (float *)output, - stream); - break; - - case nvinfer1::DataType::kINT32: - TRTONNXScatterNDKernelLauncher((int *)data, (int *)indices, (int *)update, dims, nbDims, - indices_dims, indice_nbDims, (int *)output, stream); - break; - default: - break; - } - - return 0; -} - -nvinfer1::DataType TRTScatterND::getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return 
inputTypes[0]; -} - -// IPluginV2 Methods -const char *TRTScatterND::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTScatterND::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -int TRTScatterND::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t TRTScatterND::getSerializationSize() const TRT_NOEXCEPT { return 0; } - -void TRTScatterND::serialize(void *buffer) const TRT_NOEXCEPT {} - -TRTScatterNDCreator::TRTScatterNDCreator() { - mPluginAttributes.clear(); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *TRTScatterNDCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTScatterNDCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -nvinfer1::IPluginV2 *TRTScatterNDCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - TRTScatterND *plugin = new TRTScatterND(name); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *TRTScatterNDCreator::deserializePlugin(const char *name, - const void *serialData, - size_t serialLength) TRT_NOEXCEPT { - auto plugin = new TRTScatterND(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -REGISTER_TENSORRT_PLUGIN(TRTScatterNDCreator); + + void TRTScatterND::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) TRT_NOEXCEPT {} + + size_t TRTScatterND::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT + { + return 0; + } + + int TRTScatterND::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + const int* dims = &(inputDesc[0].dims.d[0]); + const int* indices_dims = &(inputDesc[1].dims.d[0]); + int nbDims = inputDesc[0].dims.nbDims; + int indice_nbDims = inputDesc[1].dims.nbDims; + + const void* data = inputs[0]; + const void* indices = inputs[1]; + const void* update = inputs[2]; + void* output = outputs[0]; + + auto data_type = inputDesc[0].type; + + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + TRTONNXScatterNDKernelLauncher((float*)data, (int*)indices, (float*)update, dims, nbDims, indices_dims, indice_nbDims, (float*)output, stream); + break; + + case nvinfer1::DataType::kINT32: + TRTONNXScatterNDKernelLauncher((int*)data, (int*)indices, (int*)update, dims, nbDims, indices_dims, indice_nbDims, (int*)output, stream); + break; + default: + break; + } + + return 0; + } + + nvinfer1::DataType TRTScatterND::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* TRTScatterND::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTScatterND::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int TRTScatterND::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t TRTScatterND::getSerializationSize() const TRT_NOEXCEPT + { + return 0; + } + + void TRTScatterND::serialize(void* buffer) const TRT_NOEXCEPT {} + + TRTScatterNDCreator::TRTScatterNDCreator() + { + mPluginAttributes.clear(); + mFC.nbFields = mPluginAttributes.size(); 
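+        // ScatterND declares no plugin attributes, so the advertised field collection stays empty.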
+ mFC.fields = mPluginAttributes.data(); + } + + const char* TRTScatterNDCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTScatterNDCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* TRTScatterNDCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + TRTScatterND* plugin = new TRTScatterND(name); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* TRTScatterNDCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new TRTScatterND(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + REGISTER_TENSORRT_PLUGIN(TRTScatterNDCreator); } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.hpp b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.hpp index d6b859855e..b75adc40c2 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.hpp @@ -9,56 +9,54 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class TRTScatterND : public TRTPluginBase { - public: - TRTScatterND(const std::string &name); - - TRTScatterND(const std::string name, const void *data, size_t length); - - TRTScatterND() = delete; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; -}; - -class TRTScatterNDCreator : public TRTPluginCreatorBase { - public: - TRTScatterNDCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char *getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class TRTScatterND : public TRTPluginBase + { + public: + TRTScatterND(const std::string& name); + + 
TRTScatterND(const std::string name, const void* data, size_t length); + + TRTScatterND() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) + TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; + + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + }; + + class TRTScatterNDCreator : public TRTPluginCreatorBase + { + public: + TRTScatterNDCreator(); + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_SCATTERND_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu index c763992e9f..a9ec98fa36 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu @@ -8,68 +8,70 @@ using mmdeploy::TensorDesc; -template -__global__ void onnx_scatternd_kernel(const int n, const int* indices, const T* update, T* output, - TensorDesc tensor_desc, TensorDesc indice_desc) { - const int indice_cols = indice_desc.shape[indice_desc.dim - 1]; - const int copy_stride = tensor_desc.stride[indice_cols - 1]; - const int* stride = &(tensor_desc.stride[0]); - CUDA_1D_KERNEL_LOOP(index, n) { - int output_offset = 0; - const int* indices_current = indices + index * indice_cols; - for (int i = 0; i < indice_cols; ++i) { - output_offset += stride[i] * indices_current[i]; +template +__global__ void onnx_scatternd_kernel(const int n, const int* indices, const T* update, T* output, TensorDesc tensor_desc, TensorDesc indice_desc) +{ + const int indice_cols = indice_desc.shape[indice_desc.dim - 1]; + const int copy_stride = tensor_desc.stride[indice_cols - 1]; + const int* stride = &(tensor_desc.stride[0]); + CUDA_1D_KERNEL_LOOP(index, n) + { + int output_offset = 0; + const int* indices_current = indices + index * indice_cols; + for (int i = 0; i < indice_cols; ++i) + { + output_offset += 
stride[i] * indices_current[i]; + } + memcpy(output + output_offset, update + index * copy_stride, copy_stride * sizeof(T)); } - memcpy(output + output_offset, update + index * copy_stride, copy_stride * sizeof(T)); - } } -template -void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices, const T* update, - const int* dims, int nbDims, const int* indices_dims, - int indice_nbDims, T* output, cudaStream_t stream) { - // fill tensordesc and initial - TensorDesc tensor_desc; - memset((void*)&tensor_desc, 0, sizeof(TensorDesc)); - tensor_desc.dim = nbDims; - tensor_desc.shape[nbDims - 1] = dims[nbDims - 1]; - tensor_desc.stride[nbDims - 1] = 1; - for (int i = nbDims - 2; i >= 0; --i) { - tensor_desc.shape[i] = dims[i]; - tensor_desc.stride[i] = dims[i + 1] * tensor_desc.stride[i + 1]; - } - const int data_size = tensor_desc.stride[0] * tensor_desc.shape[0]; +template +void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices, const T* update, const int* dims, int nbDims, const int* indices_dims, int indice_nbDims, T* output, cudaStream_t stream) +{ + // fill tensordesc and initial + TensorDesc tensor_desc; + memset((void*)&tensor_desc, 0, sizeof(TensorDesc)); + tensor_desc.dim = nbDims; + tensor_desc.shape[nbDims - 1] = dims[nbDims - 1]; + tensor_desc.stride[nbDims - 1] = 1; + for (int i = nbDims - 2; i >= 0; --i) + { + tensor_desc.shape[i] = dims[i]; + tensor_desc.stride[i] = dims[i + 1] * tensor_desc.stride[i + 1]; + } + const int data_size = tensor_desc.stride[0] * tensor_desc.shape[0]; - TensorDesc indice_desc; - memset((void*)&indice_desc, 0, sizeof(TensorDesc)); - indice_desc.dim = indice_nbDims; - indice_desc.shape[indice_nbDims - 1] = indices_dims[indice_nbDims - 1]; - indice_desc.stride[indice_nbDims - 1] = 1; - for (int i = indice_nbDims - 2; i >= 0; --i) { - indice_desc.shape[i] = indices_dims[i]; - indice_desc.stride[i] = indices_dims[i + 1] * indice_desc.stride[i + 1]; - } + TensorDesc indice_desc; + memset((void*)&indice_desc, 0, sizeof(TensorDesc)); + indice_desc.dim = indice_nbDims; + indice_desc.shape[indice_nbDims - 1] = indices_dims[indice_nbDims - 1]; + indice_desc.stride[indice_nbDims - 1] = 1; + for (int i = indice_nbDims - 2; i >= 0; --i) + { + indice_desc.shape[i] = indices_dims[i]; + indice_desc.stride[i] = indices_dims[i + 1] * indice_desc.stride[i + 1]; + } - // output = np.copy(data) - cudaMemcpyAsync(output, data, data_size * sizeof(T), cudaMemcpyDeviceToDevice, stream); + // output = np.copy(data) + cudaMemcpyAsync(output, data, data_size * sizeof(T), cudaMemcpyDeviceToDevice, stream); - int num_update_indice = 1; - for (int i = 0; i < indice_nbDims - 1; ++i) { - num_update_indice *= indice_desc.shape[i]; - } - // scatter - const int col_block = DIVUP(num_update_indice, THREADS_PER_BLOCK); - onnx_scatternd_kernel<<>>( - num_update_indice, indices, update, output, tensor_desc, indice_desc); + int num_update_indice = 1; + for (int i = 0; i < indice_nbDims - 1; ++i) + { + num_update_indice *= indice_desc.shape[i]; + } + // scatter + const int col_block = DIVUP(num_update_indice, THREADS_PER_BLOCK); + onnx_scatternd_kernel<<>>( + num_update_indice, + indices, + update, + output, + tensor_desc, + indice_desc); } -template void TRTONNXScatterNDKernelLauncher(const float* data, const int* indices, - const float* update, const int* dims, - int nbDims, const int* indices_dims, - int indice_nbDims, float* output, - cudaStream_t stream); +template void TRTONNXScatterNDKernelLauncher(const float* data, const int* indices, const float* update, const 
int* dims, int nbDims, const int* indices_dims, int indice_nbDims, float* output, cudaStream_t stream); -template void TRTONNXScatterNDKernelLauncher(const int* data, const int* indices, - const int* update, const int* dims, int nbDims, - const int* indices_dims, int indice_nbDims, - int* output, cudaStream_t stream); +template void TRTONNXScatterNDKernelLauncher(const int* data, const int* indices, const int* update, const int* dims, int nbDims, const int* indices_dims, int indice_nbDims, int* output, cudaStream_t stream); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.hpp index b64b66494d..ae8ae2c34b 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.hpp @@ -3,9 +3,7 @@ #define TRT_SCATTERND_KERNEL_HPP #include -template -void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices, const T* update, - const int* dims, int nbDims, const int* indices_dims, - int indice_nbDims, T* output, cudaStream_t stream); +template +void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices, const T* update, const int* dims, int nbDims, const int* indices_dims, int indice_nbDims, T* output, cudaStream_t stream); #endif // TRT_SCATTERND_KERNEL_HPP diff --git a/csrc/mmdeploy/backend_ops/torchscript/ops/bind.cpp b/csrc/mmdeploy/backend_ops/torchscript/ops/bind.cpp index f236ac9b66..777b2b1eed 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/ops/bind.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/ops/bind.cpp @@ -1,13 +1,14 @@ // Copyright (c) OpenMMLab. All rights reserved. #include "torch/script.h" -TORCH_LIBRARY(mmdeploy, m) { - m.def( - "modulated_deform_conv(Tensor input, Tensor weight, Tensor bias, Tensor offset, Tensor " - "mask, " - "int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int " - "dilation_h,int dilation_w, int groups, int deform_groups, bool with_bias) -> Tensor") - .def( - "coreml_nms(Tensor boxes, Tensor scores, float iou_threshold, " - "float score_threshold, int max_boxes) -> Tensor[]"); +TORCH_LIBRARY(mmdeploy, m) +{ + m.def( + "modulated_deform_conv(Tensor input, Tensor weight, Tensor bias, Tensor offset, Tensor " + "mask, " + "int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int " + "dilation_h,int dilation_w, int groups, int deform_groups, bool with_bias) -> Tensor") + .def( + "coreml_nms(Tensor boxes, Tensor scores, float iou_threshold, " + "float score_threshold, int max_boxes) -> Tensor[]"); } diff --git a/csrc/mmdeploy/backend_ops/torchscript/ops/coreml_nms/coreml_nms_cpu.cpp b/csrc/mmdeploy/backend_ops/torchscript/ops/coreml_nms/coreml_nms_cpu.cpp index a78b701349..77fc5c6388 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/ops/coreml_nms/coreml_nms_cpu.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/ops/coreml_nms/coreml_nms_cpu.cpp @@ -4,28 +4,32 @@ #include #include "torch/script.h" -namespace mmdeploy { - -using at::Tensor; - -std::vector coreml_nms_cpu(Tensor boxes, Tensor scores, double iou_threshold, - double score_threshold, int64_t max_boxes) { - assert(boxes.dim() == 3); // bboxes with shape (batch_size, num_bboxes, 4) - assert(boxes.size(2) == 4); - assert(boxes.size(0) == scores.size(0)); // check batch size - assert(boxes.size(1) == scores.size(1)); // check num boxes - - auto batch_size = boxes.size(0); - auto num_boxes = boxes.size(1); - auto num_classes = scores.size(2); - 
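[Editor's aside] The coreml_nms CPU stub reformatted in this hunk is reachable through the schema registered in the TORCH_LIBRARY block of bind.cpp above. A minimal sketch of how such a registered op can be called from C++ once the library is loaded into the process; the tensor shapes and thresholds below are illustrative assumptions, not values from the patch:

#include <torch/script.h>

// Sketch only: look up the op registered as "mmdeploy::coreml_nms" and call it
// through a typed dispatcher handle. Shapes and thresholds are assumed.
std::vector<at::Tensor> run_coreml_nms_example()
{
    auto boxes  = at::rand({1, 100, 4});   // (batch_size, num_bboxes, 4)
    auto scores = at::rand({1, 100, 80});  // (batch_size, num_bboxes, num_classes)
    auto op     = c10::Dispatcher::singleton()
                  .findSchemaOrThrow("mmdeploy::coreml_nms", "")
                  .typed<std::vector<at::Tensor>(at::Tensor, at::Tensor, double, double, int64_t)>();
    // Returns {ret_boxes, ret_scores, indices, num_outputs}, matching the CPU stub.
    return op.call(boxes, scores, /*iou_threshold=*/0.5, /*score_threshold=*/0.05, /*max_boxes=*/10);
}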
- Tensor ret_boxes = at::zeros({batch_size, max_boxes, 4}); - Tensor ret_scores = at::zeros({batch_size, max_boxes, num_classes}); - Tensor indices = at::zeros({batch_size, max_boxes}, at::kInt); - Tensor num_outputs = at::zeros({batch_size}, at::kInt); - - return std::vector({ret_boxes, ret_scores, indices, num_outputs}); -} - -TORCH_LIBRARY_IMPL(mmdeploy, CPU, m) { m.impl("coreml_nms", coreml_nms_cpu); } +namespace mmdeploy +{ + + using at::Tensor; + + std::vector coreml_nms_cpu(Tensor boxes, Tensor scores, double iou_threshold, double score_threshold, int64_t max_boxes) + { + assert(boxes.dim() == 3); // bboxes with shape (batch_size, num_bboxes, 4) + assert(boxes.size(2) == 4); + assert(boxes.size(0) == scores.size(0)); // check batch size + assert(boxes.size(1) == scores.size(1)); // check num boxes + + auto batch_size = boxes.size(0); + auto num_boxes = boxes.size(1); + auto num_classes = scores.size(2); + + Tensor ret_boxes = at::zeros({batch_size, max_boxes, 4}); + Tensor ret_scores = at::zeros({batch_size, max_boxes, num_classes}); + Tensor indices = at::zeros({batch_size, max_boxes}, at::kInt); + Tensor num_outputs = at::zeros({batch_size}, at::kInt); + + return std::vector({ret_boxes, ret_scores, indices, num_outputs}); + } + + TORCH_LIBRARY_IMPL(mmdeploy, CPU, m) + { + m.impl("coreml_nms", coreml_nms_cpu); + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cpu.cpp b/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cpu.cpp index c6d980919f..cf404849b4 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cpu.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cpu.cpp @@ -3,19 +3,37 @@ #include "torch/script.h" -namespace mmdeploy { - -void modulated_deformable_im2col_cpu( - const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, - const int64_t batch_size, const int64_t channels, const int64_t height_im, - const int64_t width_im, const int64_t height_col, const int64_t width_col, - const int64_t kernel_h, const int64_t kernel_w, const int64_t pad_h, const int64_t pad_w, - const int64_t stride_h, const int64_t stride_w, const int64_t dilation_h, - const int64_t dilation_w, int64_t deformable_group, at::Tensor data_col) { - // num_axes should be smaller than block size - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - data_im.scalar_type(), "modulated_deformable_im2col_cpu", ([&] { +namespace mmdeploy +{ + + void modulated_deformable_im2col_cpu( + const at::Tensor data_im, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int64_t batch_size, + const int64_t channels, + const int64_t height_im, + const int64_t width_im, + const int64_t height_col, + const int64_t width_col, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_h, + const int64_t dilation_w, + int64_t deformable_group, + at::Tensor data_col) + { + // num_axes should be smaller than block size + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), + "modulated_deformable_im2col_cpu", + ([&] + { const scalar_t *data_im_ = data_im.data_ptr(); const scalar_t *data_offset_ = data_offset.data_ptr(); const scalar_t *data_mask_ = data_mask.data_ptr(); @@ -24,71 +42,66 @@ void modulated_deformable_im2col_cpu( deformable_im2col_2d(data_im_, data_offset_, 
data_mask_, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channels, deformable_group, - height_col, width_col, data_mask_ != nullptr, data_col_); - })); -} - -at::Tensor modulated_deform_conv_forward_cpu(at::Tensor input, at::Tensor weight, at::Tensor bias, - at::Tensor offset, at::Tensor mask, int64_t kernel_h, - int64_t kernel_w, int64_t stride_h, int64_t stride_w, - int64_t pad_h, int64_t pad_w, int64_t dilation_h, - int64_t dilation_w, int64_t group, - int64_t deformable_group, bool with_bias) { - at::DeviceGuard guard(input.device()); - - const int batch = input.size(0); - const int channels = input.size(1); - const int height = input.size(2); - const int width = input.size(3); - - const int channels_out = weight.size(0); - const int channels_kernel = weight.size(1); - const int kernel_h_ = weight.size(2); - const int kernel_w_ = weight.size(3); - - if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) - AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, - kernel_h_, kernel_w_); - if (channels != channels_kernel * group) - AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).", channels, - channels_kernel * group); - - const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - - // resize output - at::Tensor output = - at::zeros({batch, group, channels_out / group, height_out, width_out}, input.options()); - // resize temporary columns - at::Tensor columns = at::zeros( - {group, channels * kernel_h * kernel_w / group, 1 * height_out * width_out}, input.options()); - - // divide into group - weight = - weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); - for (int b = 0; b < batch; b++) { - modulated_deformable_im2col_cpu(input[b], offset[b], mask[b], 1, channels, height, width, - height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, - stride_h, stride_w, dilation_h, dilation_w, deformable_group, - columns); - - for (int g = 0; g < group; g++) { - output[b][g] = - output[b][g].flatten(1).addmm_(weight[g].flatten(1), columns[g]).view_as(output[b][g]); + height_col, width_col, data_mask_ != nullptr, data_col_); })); } - } - output = output.view( - {output.size(0), output.size(1) * output.size(2), output.size(3), output.size(4)}); - - if (with_bias) { - output += bias.view({1, bias.size(0), 1, 1}); - } - - return output; -} + at::Tensor modulated_deform_conv_forward_cpu(at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor offset, at::Tensor mask, int64_t kernel_h, int64_t kernel_w, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, int64_t dilation_h, int64_t dilation_w, int64_t group, int64_t deformable_group, bool with_bias) + { + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).", channels, 
channels_kernel * group); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + // resize output + at::Tensor output = + at::zeros({batch, group, channels_out / group, height_out, width_out}, input.options()); + // resize temporary columns + at::Tensor columns = at::zeros( + {group, channels * kernel_h * kernel_w / group, 1 * height_out * width_out}, + input.options()); + + // divide into group + weight = + weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); + for (int b = 0; b < batch; b++) + { + modulated_deformable_im2col_cpu(input[b], offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns); + + for (int g = 0; g < group; g++) + { + output[b][g] = + output[b][g].flatten(1).addmm_(weight[g].flatten(1), columns[g]).view_as(output[b][g]); + } + } + + output = output.view( + {output.size(0), output.size(1) * output.size(2), output.size(3), output.size(4)}); + + if (with_bias) + { + output += bias.view({1, bias.size(0), 1, 1}); + } + + return output; + } -TORCH_LIBRARY_IMPL(mmdeploy, CPU, m) { - m.impl("modulated_deform_conv", modulated_deform_conv_forward_cpu); -} + TORCH_LIBRARY_IMPL(mmdeploy, CPU, m) + { + m.impl("modulated_deform_conv", modulated_deform_conv_forward_cpu); + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cuda.cu b/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cuda.cu index 3f9b6aef08..83fddb8a8c 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cuda.cu +++ b/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cuda.cu @@ -3,21 +3,39 @@ #include "modulated_deform_conv/modulated_deform_conv_cuda.cuh" #include "torch/script.h" -namespace mmdeploy { +namespace mmdeploy +{ -void modulated_deformable_im2col_cuda( - const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, - const int64_t batch_size, const int64_t channels, const int64_t height_im, - const int64_t width_im, const int64_t height_col, const int64_t width_col, - const int64_t kernel_h, const int64_t kernel_w, const int64_t pad_h, const int64_t pad_w, - const int64_t stride_h, const int64_t stride_w, const int64_t dilation_h, - const int64_t dilation_w, const int64_t deformable_group, at::Tensor data_col) { - // num_axes should be smaller than block size - const int channel_per_deformable_group = channels / deformable_group; - const int num_kernels = channels * batch_size * height_col * width_col; + void modulated_deformable_im2col_cuda( + const at::Tensor data_im, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int64_t batch_size, + const int64_t channels, + const int64_t height_im, + const int64_t width_im, + const int64_t height_col, + const int64_t width_col, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_h, + const int64_t dilation_w, + const int64_t deformable_group, + at::Tensor data_col) + { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = 
channels * batch_size * height_col * width_col; - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - data_im.scalar_type(), "modulated_deformable_im2col_cuda", ([&] { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), + "modulated_deformable_im2col_cuda", + ([&] + { const scalar_t *data_im_ = data_im.data_ptr(); const scalar_t *data_offset_ = data_offset.data_ptr(); const scalar_t *data_mask_ = data_mask.data_ptr(); @@ -27,71 +45,66 @@ void modulated_deformable_im2col_cuda( num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, channels, deformable_group, height_col, - width_col, data_col_); - })); -} + width_col, data_col_); })); + } -at::Tensor modulated_deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, at::Tensor bias, - at::Tensor offset, at::Tensor mask, int64_t kernel_h, - int64_t kernel_w, int64_t stride_h, int64_t stride_w, - int64_t pad_h, int64_t pad_w, int64_t dilation_h, - int64_t dilation_w, int64_t group, - int64_t deformable_group, bool with_bias) { - at::DeviceGuard guard(input.device()); + at::Tensor modulated_deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor offset, at::Tensor mask, int64_t kernel_h, int64_t kernel_w, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, int64_t dilation_h, int64_t dilation_w, int64_t group, int64_t deformable_group, bool with_bias) + { + at::DeviceGuard guard(input.device()); - const int batch = input.size(0); - const int channels = input.size(1); - const int height = input.size(2); - const int width = input.size(3); + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); - const int channels_out = weight.size(0); - const int channels_kernel = weight.size(1); - const int kernel_h_ = weight.size(2); - const int kernel_w_ = weight.size(3); + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); - if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) - AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, - kernel_h_, kernel_w_); - if (channels != channels_kernel * group) - AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).", channels, - channels_kernel * group); + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).", channels, channels_kernel * group); - const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - // resize output - at::Tensor output = - at::zeros({batch, group, channels_out / group, height_out, width_out}, input.options()); - // resize temporary columns - at::Tensor columns = at::zeros( - {group, channels * kernel_h * kernel_w / group, 1 * height_out * width_out}, input.options()); + // resize output + at::Tensor output = 
+ at::zeros({batch, group, channels_out / group, height_out, width_out}, input.options()); + // resize temporary columns + at::Tensor columns = at::zeros( + {group, channels * kernel_h * kernel_w / group, 1 * height_out * width_out}, + input.options()); - // divide into group - weight = - weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); - for (int b = 0; b < batch; b++) { - modulated_deformable_im2col_cuda(input[b], offset[b], mask[b], 1, channels, height, width, - height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, - stride_h, stride_w, dilation_h, dilation_w, deformable_group, - columns); + // divide into group + weight = + weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); + for (int b = 0; b < batch; b++) + { + modulated_deformable_im2col_cuda(input[b], offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns); - for (int g = 0; g < group; g++) { - output[b][g] = - output[b][g].flatten(1).addmm_(weight[g].flatten(1), columns[g]).view_as(output[b][g]); - } - } + for (int g = 0; g < group; g++) + { + output[b][g] = + output[b][g].flatten(1).addmm_(weight[g].flatten(1), columns[g]).view_as(output[b][g]); + } + } - output = output.view( - {output.size(0), output.size(1) * output.size(2), output.size(3), output.size(4)}); + output = output.view( + {output.size(0), output.size(1) * output.size(2), output.size(3), output.size(4)}); - if (with_bias) { - output += bias.view({1, bias.size(0), 1, 1}); - } + if (with_bias) + { + output += bias.view({1, bias.size(0), 1, 1}); + } - return output; -} + return output; + } -TORCH_LIBRARY_IMPL(mmdeploy, CUDA, m) { - m.impl("modulated_deform_conv", modulated_deform_conv_forward_cuda); -} + TORCH_LIBRARY_IMPL(mmdeploy, CUDA, m) + { + m.impl("modulated_deform_conv", modulated_deform_conv_forward_cuda); + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/bind.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/bind.cpp index 3b8bb0f632..58cf0c6018 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/bind.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/bind.cpp @@ -12,36 +12,39 @@ #include "passes/onnx/merge_shape_concate.h" #include "passes/onnx/onnx_peephole.h" -namespace mmdeploy { -namespace torch_jit { +namespace mmdeploy +{ + namespace torch_jit + { -void optimize_for_backend(torch::jit::Module& model, const std::string& ir = "torchscript", - const std::string& backend = "torchscript") { - if (ir == "torchscript") { - model = optimize_for_torchscript(model); - } else if (ir == "onnx") { - model = optimize_for_onnx(model); - } else { - fprintf(stderr, "No optimize for combination ir: %s backend: %s\n", ir.c_str(), - backend.c_str()); - exit(-1); - } -} + void optimize_for_backend(torch::jit::Module& model, const std::string& ir = "torchscript", const std::string& backend = "torchscript") + { + if (ir == "torchscript") + { + model = optimize_for_torchscript(model); + } + else if (ir == "onnx") + { + model = optimize_for_onnx(model); + } + else + { + fprintf(stderr, "No optimize for combination ir: %s backend: %s\n", ir.c_str(), backend.c_str()); + exit(-1); + } + } -PYBIND11_MODULE(ts_optimizer, m) { - namespace py = pybind11; - m.def("optimize_for_backend", optimize_for_backend, py::arg("module"), - py::arg("ir") = std::string("torchscript"), - py::arg("backend") = 
std::string("torchscript")); - py::module_ onnx_module = m.def_submodule("onnx"); - onnx_module.def("_jit_pass_merge_shape_concate", MergeShapeConcate, py::arg("graph")); - onnx_module.def("_jit_pass_onnx_peephole", ONNXPeephole, py::arg("graph")); - onnx_module.def("_jit_pass_flatten_cls_head", FlattenClsHead, py::arg("graph")); - onnx_module.def("_jit_pass_fuse_select_assign", FuseSelectAssign, py::arg("graph"), - py::arg("params")); - onnx_module.def("_jit_pass_common_subgraph_elimination", CommonSubgraphElimination, - py::arg("graph"), py::arg("params")); -} + PYBIND11_MODULE(ts_optimizer, m) + { + namespace py = pybind11; + m.def("optimize_for_backend", optimize_for_backend, py::arg("module"), py::arg("ir") = std::string("torchscript"), py::arg("backend") = std::string("torchscript")); + py::module_ onnx_module = m.def_submodule("onnx"); + onnx_module.def("_jit_pass_merge_shape_concate", MergeShapeConcate, py::arg("graph")); + onnx_module.def("_jit_pass_onnx_peephole", ONNXPeephole, py::arg("graph")); + onnx_module.def("_jit_pass_flatten_cls_head", FlattenClsHead, py::arg("graph")); + onnx_module.def("_jit_pass_fuse_select_assign", FuseSelectAssign, py::arg("graph"), py::arg("params")); + onnx_module.def("_jit_pass_common_subgraph_elimination", CommonSubgraphElimination, py::arg("graph"), py::arg("params")); + } -} // namespace torch_jit + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.cpp index 10ce9829d5..e5f06e9c8b 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.cpp @@ -8,306 +8,355 @@ #include #include -namespace mmdeploy { -namespace torch_jit { - -using torch::jit::AttributeKind; -using torch::jit::ClassType; -using torch::jit::Node; -using torch::jit::Symbol; -using torch::jit::Value; - -namespace prim { -using namespace ::c10::prim; -} - -namespace attr { -using namespace ::c10::attr; -} - -/** - * \brief A class implementing an API for comparing subgraphs. - */ -class SubgraphMatcher::SubgraphMatcherImpl { - public: - explicit SubgraphMatcherImpl(const Graph& pattern, MatchAttribute match_attribute) - : pattern_(pattern), match_attribute_(match_attribute) {} - - /** - * \brief Compare matchGraph with the part of the graph denoted by a node \p - * ANCHOR. - * - * The anchor node would be compared against the deepest node in the - * match-graph. A node is considered matching if its number of inputs/outputs - * is the same as in the corresponding matchGraph node, its type is the same, - * and all nodes producing input-values also match. - */ - bool matchesSubgraphFromAnchorNode(Node* anchor); - - /** \brief Return match map for nodes. */ - std::unordered_map nodes_map() const { return nodes_map_; } - - /** \brief Return match map for values. 
*/ - std::unordered_map values_map() const { return values_map_; } - - private: - bool matchValues(const Value* v1, Value* v2); - bool matchNodes(const Node* n1, Node* n2); - bool matchAttributes(const Node* n1, Node* n2); - - static bool isInput(const Value* v); - static bool isOutput(const Value* v); - - std::unordered_map nodes_map_; - std::unordered_map values_map_; - - const MatchAttribute match_attribute_; - const Graph& pattern_; - const Node* anchor_ = nullptr; -}; - -bool SubgraphMatcher::SubgraphMatcherImpl::isInput(const Value* v) { - return v->node()->kind() == prim::Param; -} - -bool SubgraphMatcher::SubgraphMatcherImpl::isOutput(const Value* v) { - for (const Value* output : v->owningGraph()->outputs()) { - if (v == output) { - return true; - } - } - return false; -} - -/** - * Compare two Values. V1 is from pattern, V2 is from the actual graph. - * - * The values are considered matching if: - * 1) the nodes defining them match - * 2) they have the same number of uses, except they are entry or exit nodes. - */ -bool SubgraphMatcher::SubgraphMatcherImpl::matchValues(const Value* v1, Value* v2) { - // Check if we've already visited these values. - if (values_map_.count(v1)) { - if (values_map_.at(v1) != v2) { - GRAPH_DEBUG("Values %", v1->debugName(), " and %", v2->debugName(), - " did not match because %", v1->debugName(), " has already been matched with %", - values_map_.at(v1)->debugName(), ".\n"); - return false; - } - return true; - } - - // When V2 is ANCHOR, we're comparing exiting values, and when V1->node is - // PARAM, we're comparing entering values - in these two cases the number of - // uses don't need to be the same. - if (v1->uses().size() != v2->uses().size() && !isOutput(v1) && !isInput(v1)) { - GRAPH_DEBUG("Values %", v1->debugName(), " and %", v2->debugName(), - " did not match because number of their uses is different.\n"); - return false; - } - - // Add the values to the map before calling matchNodes to avoid infinite - // recursion. 
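[Editor's aside] One behavior carried through this reformat is easy to miss in matchAttributes below: for AttributeKind::s, the pattern node's string attribute is compiled as a regular expression and matched against the whole attribute of the candidate node. A standalone sketch of that semantic, with illustrative attribute values:

#include <cassert>
#include <regex>
#include <string>

// Sketch only: the pattern's string attribute acts as a regex, so a pattern
// carrying "aten::.*" matches any candidate attribute starting with "aten::".
int main()
{
    const std::string pattern_attr = "aten::.*";    // attribute stored on the pattern node
    const std::string target_attr  = "aten::relu";  // attribute stored on the graph node
    assert(std::regex_match(target_attr, std::regex(pattern_attr)));
    assert(!std::regex_match(std::string("prim::Constant"), std::regex(pattern_attr)));
    return 0;
}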
- GRAPH_DEBUG("Values %", v1->debugName(), " and %", v2->debugName(), " matched.\n"); - values_map_[v1] = v2; - return matchNodes(v1->node(), v2->node()); -} - -bool SubgraphMatcher::SubgraphMatcherImpl::matchAttributes(const Node* n1, Node* n2) { - if (match_attribute_ == FORCE_MATCH && n1->numAttributes() != n2->numAttributes()) { - GRAPH_DEBUG("Nodes did not match in number attributes:\n", *n1, *n2); - return false; - } - for (const Symbol& attr_name : n1->attributeNames()) { - if (n1->kindOf(attr_name) != n2->kindOf(attr_name)) { - GRAPH_DEBUG("Nodes did not match because type of attribute '", attr_name.toQualString(), - "' did not match:\n", *n1, *n2); - return false; - } - std::vector n1is, n2is; - std::vector n1fs, n2fs; - switch (n1->kindOf(attr_name)) { - case AttributeKind::s: - if (!std::regex_match(n2->s(attr_name), std::regex(n1->s(attr_name)))) { - GRAPH_DEBUG("Nodes did not match because attribute '", attr_name.toQualString(), - "' did not match: ", n1->s(attr_name), " != ", n2->s(attr_name), " \n", *n1, - *n2); - return false; +namespace mmdeploy +{ + namespace torch_jit + { + + using torch::jit::AttributeKind; + using torch::jit::ClassType; + using torch::jit::Node; + using torch::jit::Symbol; + using torch::jit::Value; + + namespace prim + { + using namespace ::c10::prim; } - break; - case AttributeKind::f: - if (n1->f(attr_name) != n2->f(attr_name)) { - GRAPH_DEBUG("Nodes did not match because attribute '", attr_name.toQualString(), - "' did not match:", n1->f(attr_name), " != ", n2->f(attr_name), " \n", *n1, - *n2); - return false; + + namespace attr + { + using namespace ::c10::attr; + } + + /** + * \brief A class implementing an API for comparing subgraphs. + */ + class SubgraphMatcher::SubgraphMatcherImpl + { + public: + explicit SubgraphMatcherImpl(const Graph& pattern, MatchAttribute match_attribute) + : pattern_(pattern) + , match_attribute_(match_attribute) + { + } + + /** + * \brief Compare matchGraph with the part of the graph denoted by a node \p + * ANCHOR. + * + * The anchor node would be compared against the deepest node in the + * match-graph. A node is considered matching if its number of inputs/outputs + * is the same as in the corresponding matchGraph node, its type is the same, + * and all nodes producing input-values also match. + */ + bool matchesSubgraphFromAnchorNode(Node* anchor); + + /** \brief Return match map for nodes. */ + std::unordered_map nodes_map() const + { + return nodes_map_; + } + + /** \brief Return match map for values. */ + std::unordered_map values_map() const + { + return values_map_; + } + + private: + bool matchValues(const Value* v1, Value* v2); + bool matchNodes(const Node* n1, Node* n2); + bool matchAttributes(const Node* n1, Node* n2); + + static bool isInput(const Value* v); + static bool isOutput(const Value* v); + + std::unordered_map nodes_map_; + std::unordered_map values_map_; + + const MatchAttribute match_attribute_; + const Graph& pattern_; + const Node* anchor_ = nullptr; + }; + + bool SubgraphMatcher::SubgraphMatcherImpl::isInput(const Value* v) + { + return v->node()->kind() == prim::Param; + } + + bool SubgraphMatcher::SubgraphMatcherImpl::isOutput(const Value* v) + { + for (const Value* output : v->owningGraph()->outputs()) + { + if (v == output) + { + return true; + } + } + return false; + } + + /** + * Compare two Values. V1 is from pattern, V2 is from the actual graph. 
+ * + * The values are considered matching if: + * 1) the nodes defining them match + * 2) they have the same number of uses, except they are entry or exit nodes. + */ + bool SubgraphMatcher::SubgraphMatcherImpl::matchValues(const Value* v1, Value* v2) + { + // Check if we've already visited these values. + if (values_map_.count(v1)) + { + if (values_map_.at(v1) != v2) + { + GRAPH_DEBUG("Values %", v1->debugName(), " and %", v2->debugName(), " did not match because %", v1->debugName(), " has already been matched with %", values_map_.at(v1)->debugName(), ".\n"); + return false; + } + return true; + } + + // When V2 is ANCHOR, we're comparing exiting values, and when V1->node is + // PARAM, we're comparing entering values - in these two cases the number of + // uses don't need to be the same. + if (v1->uses().size() != v2->uses().size() && !isOutput(v1) && !isInput(v1)) + { + GRAPH_DEBUG("Values %", v1->debugName(), " and %", v2->debugName(), " did not match because number of their uses is different.\n"); + return false; + } + + // Add the values to the map before calling matchNodes to avoid infinite + // recursion. + GRAPH_DEBUG("Values %", v1->debugName(), " and %", v2->debugName(), " matched.\n"); + values_map_[v1] = v2; + return matchNodes(v1->node(), v2->node()); + } + + bool SubgraphMatcher::SubgraphMatcherImpl::matchAttributes(const Node* n1, Node* n2) + { + if (match_attribute_ == FORCE_MATCH && n1->numAttributes() != n2->numAttributes()) + { + GRAPH_DEBUG("Nodes did not match in number attributes:\n", *n1, *n2); + return false; + } + for (const Symbol& attr_name : n1->attributeNames()) + { + if (n1->kindOf(attr_name) != n2->kindOf(attr_name)) + { + GRAPH_DEBUG("Nodes did not match because type of attribute '", attr_name.toQualString(), "' did not match:\n", *n1, *n2); + return false; + } + std::vector n1is, n2is; + std::vector n1fs, n2fs; + switch (n1->kindOf(attr_name)) + { + case AttributeKind::s: + if (!std::regex_match(n2->s(attr_name), std::regex(n1->s(attr_name)))) + { + GRAPH_DEBUG("Nodes did not match because attribute '", attr_name.toQualString(), "' did not match: ", n1->s(attr_name), " != ", n2->s(attr_name), " \n", *n1, *n2); + return false; + } + break; + case AttributeKind::f: + if (n1->f(attr_name) != n2->f(attr_name)) + { + GRAPH_DEBUG("Nodes did not match because attribute '", attr_name.toQualString(), "' did not match:", n1->f(attr_name), " != ", n2->f(attr_name), " \n", *n1, *n2); + return false; + } + break; + case AttributeKind::i: + if (n1->i(attr_name) != n2->i(attr_name)) + { + GRAPH_DEBUG("Nodes did not match because attribute '", attr_name.toQualString(), "' did not match:", n1->i(attr_name), " != ", n2->i(attr_name), " \n", *n1, *n2); + return false; + } + break; + case AttributeKind::is: + n1is = n1->is(attr_name); + n2is = n2->is(attr_name); + if (n1is.size() != n2is.size()) return false; + for (size_t i = 0; i < n1is.size(); ++i) + { + if (n1is[i] != n2is[i]) return false; + } + break; + case AttributeKind::fs: + n1fs = n1->fs(attr_name); + n2fs = n2->fs(attr_name); + if (n1fs.size() != n2fs.size()) return false; + for (size_t i = 0; i < n1fs.size(); ++i) + { + if (n1fs[i] != n2fs[i]) return false; + } + break; + default: + { + // Other attributes types not supported yet + GRAPH_DEBUG("Nodes did not match because type of attribute '", attr_name.toQualString(), "' is not supported.\n", *n1, *n2); + return false; + } + } + } + return true; + } + + static bool endsWith(const std::string& str, const std::string& suffix) + { + return str.size() >= 
suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } - break; - case AttributeKind::i: - if (n1->i(attr_name) != n2->i(attr_name)) { - GRAPH_DEBUG("Nodes did not match because attribute '", attr_name.toQualString(), - "' did not match:", n1->i(attr_name), " != ", n2->i(attr_name), " \n", *n1, - *n2); - return false; + + /** + * Compare two Nodes. N1 is from pattern, N2 is from the actual graph. + * + * The nodes are considered matching if: + * 1) N1 and N2 are of the same kind. + * 2) Number of inputs and outputs is the same. + * 3) All input and output values match. + * + * A special case is when N1 is PARAM - this is considered outside the pattern, + * so it matches everything. + */ + bool SubgraphMatcher::SubgraphMatcherImpl::matchNodes(const Node* n1, Node* n2) + { + // Check if we've already visited these nodes. + if (nodes_map_.count(n1)) + { + return nodes_map_.at(n1) == n2; + } + + // Param node in pattern graph matches everything. + if (n1->kind() == prim::Param) + { + GRAPH_DEBUG("Nodes matched:\n", *n1, *n2); + return true; + } + + // We don't allow matches to span across blocks, so check if N2 is in the same + // block as the first (anchor) node. + if (n2->owningBlock() != anchor_->owningBlock()) + { + GRAPH_DEBUG("Nodes did not match because it is in the different block:\n", *n1, *n2); + return false; + } + + // Special handling for matching modules + if (n1->kind() == Symbol::fromQualString("match::module")) + { + if (n2->kind() == prim::GetAttr) + { + if (!n1->hasAttributeS("name")) + { + GRAPH_DEBUG( + "Nodes did not match because special node match::module does not have 'name' " + "attribute:\n", + *n1, + *n2); + return false; + } + auto t = n2->output()->type()->expect(); + auto real_typename = t->name()->qualifiedName(); + auto pattern_typename = n1->s(attr::name); + if (!endsWith(real_typename, pattern_typename)) + { + GRAPH_DEBUG("Nodes did not match because expected module type is different:\n"); + GRAPH_DEBUG(" actualtype: ", real_typename, "\n"); + GRAPH_DEBUG(" expected type: ", pattern_typename, "\n"); + GRAPH_DEBUG("Nodes:", *n1, *n2); + return false; + } + } + } + else + { + if (n1->kind() != n2->kind() || n1->outputs().size() != n2->outputs().size() || + n1->inputs().size() != n2->inputs().size()) + { + GRAPH_DEBUG("Nodes did not match in their kind or number of inputs/outputs:\n", *n1, *n2); + return false; + } + + if (match_attribute_ != NO_MATCH) + { + if (!matchAttributes(n1, n2)) + { + return false; + } + } + } + + // Add nodes to the map before calling matchValues to avoid infinite + // recursion. + nodes_map_[n1] = n2; + for (const auto i : c10::irange(n1->outputs().size())) + { + if (!matchValues(n1->outputs()[i], n2->outputs()[i])) + { + return false; + } + } + for (const auto i : c10::irange(n1->inputs().size())) + { + if (!matchValues(n1->inputs()[i], n2->inputs()[i])) + { + return false; + } + } + + GRAPH_DEBUG("Nodes matched:\n", *n1, *n2); + return true; + } + + /** + * Recursively try to match pattern with the actual graph starting from the + * exiting node in the pattern and anchor node in the actual graph. 
+ */ + bool SubgraphMatcher::SubgraphMatcherImpl::matchesSubgraphFromAnchorNode(Node* anchor) + { + GRAPH_UPDATE("Starting match from a new anchor: ", *anchor); + nodes_map_.clear(); + values_map_.clear(); + anchor_ = anchor; + + const Node* bottom_node = *(pattern_.nodes().end()); + bottom_node = bottom_node->input(0)->node(); + + if (!matchNodes(bottom_node, anchor)) + { + return false; + } + + for (const Value* output : pattern_.outputs()) + { + AT_ASSERT(values_map_.count(output)); + } + + GRAPH_UPDATE("Pattern matched!\n"); + return true; } - break; - case AttributeKind::is: - n1is = n1->is(attr_name); - n2is = n2->is(attr_name); - if (n1is.size() != n2is.size()) return false; - for (size_t i = 0; i < n1is.size(); ++i) { - if (n1is[i] != n2is[i]) return false; + + SubgraphMatcher::SubgraphMatcher(const Graph& pattern, MatchAttribute match_attribute) + : impl_(new SubgraphMatcher::SubgraphMatcherImpl(pattern, match_attribute)) + { } - break; - case AttributeKind::fs: - n1fs = n1->fs(attr_name); - n2fs = n2->fs(attr_name); - if (n1fs.size() != n2fs.size()) return false; - for (size_t i = 0; i < n1fs.size(); ++i) { - if (n1fs[i] != n2fs[i]) return false; + + SubgraphMatcher::~SubgraphMatcher() = default; + + bool SubgraphMatcher::matchesSubgraphFromAnchorNode(Node* anchor) + { + return impl_->matchesSubgraphFromAnchorNode(anchor); } - break; - default: { - // Other attributes types not supported yet - GRAPH_DEBUG("Nodes did not match because type of attribute '", attr_name.toQualString(), - "' is not supported.\n", *n1, *n2); - return false; - } - } - } - return true; -} - -static bool endsWith(const std::string& str, const std::string& suffix) { - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - -/** - * Compare two Nodes. N1 is from pattern, N2 is from the actual graph. - * - * The nodes are considered matching if: - * 1) N1 and N2 are of the same kind. - * 2) Number of inputs and outputs is the same. - * 3) All input and output values match. - * - * A special case is when N1 is PARAM - this is considered outside the pattern, - * so it matches everything. - */ -bool SubgraphMatcher::SubgraphMatcherImpl::matchNodes(const Node* n1, Node* n2) { - // Check if we've already visited these nodes. - if (nodes_map_.count(n1)) { - return nodes_map_.at(n1) == n2; - } - - // Param node in pattern graph matches everything. - if (n1->kind() == prim::Param) { - GRAPH_DEBUG("Nodes matched:\n", *n1, *n2); - return true; - } - - // We don't allow matches to span across blocks, so check if N2 is in the same - // block as the first (anchor) node. 
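[Editor's aside] The match::module special case in matchNodes compares only a suffix of the qualified class name via endsWith, so a pattern can name a module type without spelling out its full Python path. A standalone sketch of that rule; the qualified name below is an assumption for illustration:

#include <cassert>
#include <string>

// Sketch only: suffix comparison as used for match::module type names.
static bool ends_with(const std::string& str, const std::string& suffix)
{
    return str.size() >= suffix.size() &&
           str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}

int main()
{
    assert(ends_with("__torch__.torchvision.models.resnet.ResNet", "ResNet"));
    assert(!ends_with("__torch__.my.Model", "OtherModel"));
    return 0;
}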
- if (n2->owningBlock() != anchor_->owningBlock()) { - GRAPH_DEBUG("Nodes did not match because it is in the different block:\n", *n1, *n2); - return false; - } - - // Special handling for matching modules - if (n1->kind() == Symbol::fromQualString("match::module")) { - if (n2->kind() == prim::GetAttr) { - if (!n1->hasAttributeS("name")) { - GRAPH_DEBUG( - "Nodes did not match because special node match::module does not have 'name' " - "attribute:\n", - *n1, *n2); - return false; - } - auto t = n2->output()->type()->expect(); - auto real_typename = t->name()->qualifiedName(); - auto pattern_typename = n1->s(attr::name); - if (!endsWith(real_typename, pattern_typename)) { - GRAPH_DEBUG("Nodes did not match because expected module type is different:\n"); - GRAPH_DEBUG(" actualtype: ", real_typename, "\n"); - GRAPH_DEBUG(" expected type: ", pattern_typename, "\n"); - GRAPH_DEBUG("Nodes:", *n1, *n2); - return false; - } - } - } else { - if (n1->kind() != n2->kind() || n1->outputs().size() != n2->outputs().size() || - n1->inputs().size() != n2->inputs().size()) { - GRAPH_DEBUG("Nodes did not match in their kind or number of inputs/outputs:\n", *n1, *n2); - return false; - } - - if (match_attribute_ != NO_MATCH) { - if (!matchAttributes(n1, n2)) { - return false; - } - } - } - - // Add nodes to the map before calling matchValues to avoid infinite - // recursion. - nodes_map_[n1] = n2; - for (const auto i : c10::irange(n1->outputs().size())) { - if (!matchValues(n1->outputs()[i], n2->outputs()[i])) { - return false; - } - } - for (const auto i : c10::irange(n1->inputs().size())) { - if (!matchValues(n1->inputs()[i], n2->inputs()[i])) { - return false; - } - } - - GRAPH_DEBUG("Nodes matched:\n", *n1, *n2); - return true; -} - -/** - * Recursively try to match pattern with the actual graph starting from the - * exiting node in the pattern and anchor node in the actual graph. 
- */ -bool SubgraphMatcher::SubgraphMatcherImpl::matchesSubgraphFromAnchorNode(Node* anchor) { - GRAPH_UPDATE("Starting match from a new anchor: ", *anchor); - nodes_map_.clear(); - values_map_.clear(); - anchor_ = anchor; - - const Node* bottom_node = *(pattern_.nodes().end()); - bottom_node = bottom_node->input(0)->node(); - - if (!matchNodes(bottom_node, anchor)) { - return false; - } - - for (const Value* output : pattern_.outputs()) { - AT_ASSERT(values_map_.count(output)); - } - - GRAPH_UPDATE("Pattern matched!\n"); - return true; -} - -SubgraphMatcher::SubgraphMatcher(const Graph& pattern, MatchAttribute match_attribute) - : impl_(new SubgraphMatcher::SubgraphMatcherImpl(pattern, match_attribute)) {} - -SubgraphMatcher::~SubgraphMatcher() = default; - -bool SubgraphMatcher::matchesSubgraphFromAnchorNode(Node* anchor) { - return impl_->matchesSubgraphFromAnchorNode(anchor); -} - -std::unordered_map SubgraphMatcher::nodes_map() const { - return impl_->nodes_map(); -} - -std::unordered_map SubgraphMatcher::values_map() const { - return impl_->values_map(); -} - -} // namespace torch_jit + + std::unordered_map SubgraphMatcher::nodes_map() const + { + return impl_->nodes_map(); + } + + std::unordered_map SubgraphMatcher::values_map() const + { + return impl_->values_map(); + } + + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.h index e2488e252c..ffe1b51aa8 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.h @@ -5,34 +5,42 @@ #include #include -namespace mmdeploy { -namespace torch_jit { -using torch::jit::Graph; -using torch::jit::Node; -using torch::jit::Value; - -enum MatchAttribute { FORCE_MATCH, TRY_MATCH, NO_MATCH }; - -class SubgraphMatcher { - public: - explicit SubgraphMatcher(const Graph& pattern, MatchAttribute match_attribute = TRY_MATCH); - - ~SubgraphMatcher(); - - bool matchesSubgraphFromAnchorNode(Node* anchor); - - /** \brief Return match map for nodes. */ - std::unordered_map nodes_map() const; - - /** \brief Return match map for values. */ - std::unordered_map values_map() const; - - private: - class SubgraphMatcherImpl; - std::unique_ptr impl_; -}; - -} // namespace torch_jit +namespace mmdeploy +{ + namespace torch_jit + { + using torch::jit::Graph; + using torch::jit::Node; + using torch::jit::Value; + + enum MatchAttribute + { + FORCE_MATCH, + TRY_MATCH, + NO_MATCH + }; + + class SubgraphMatcher + { + public: + explicit SubgraphMatcher(const Graph& pattern, MatchAttribute match_attribute = TRY_MATCH); + + ~SubgraphMatcher(); + + bool matchesSubgraphFromAnchorNode(Node* anchor); + + /** \brief Return match map for nodes. */ + std::unordered_map nodes_map() const; + + /** \brief Return match map for values. 
*/ + std::unordered_map values_map() const; + + private: + class SubgraphMatcherImpl; + std::unique_ptr impl_; + }; + + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.cpp index 05ef9d54cd..2178bb3a4e 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.cpp @@ -12,59 +12,63 @@ #include #if TORCH_VERSION_MINOR >= 9 -#include -#include -#include + #include + #include + #include #endif -namespace mmdeploy { +namespace mmdeploy +{ -using torch::jit::Graph; -const std::shared_ptr& required_passes(const std::shared_ptr& graph) { - RemoveExpands(graph); - CanonicalizeOps(graph); - EliminateDeadCode(graph); - return graph; -} + using torch::jit::Graph; + const std::shared_ptr& required_passes(const std::shared_ptr& graph) + { + RemoveExpands(graph); + CanonicalizeOps(graph); + EliminateDeadCode(graph); + return graph; + } -Module optimize_for_torchscript(const Module& model) { - auto frozen_model = freeze_module(model); - auto graph = frozen_model.get_method("forward").graph(); - OptimizeFrozenGraph(graph, true); + Module optimize_for_torchscript(const Module& model) + { + auto frozen_model = freeze_module(model); + auto graph = frozen_model.get_method("forward").graph(); + OptimizeFrozenGraph(graph, true); #if TORCH_VERSION_MINOR >= 9 - FuseFrozenConvAddRelu(graph); - ConvertFrozenOpsToMKLDNN(graph); - FrozenLinearTranspose(graph); + FuseFrozenConvAddRelu(graph); + ConvertFrozenOpsToMKLDNN(graph); + FrozenLinearTranspose(graph); #endif - graph = required_passes(graph); - EliminateCommonSubexpression(graph); - PeepholeOptimize(graph); - ConstantPropagation(graph); - ConstantPooling(graph); + graph = required_passes(graph); + EliminateCommonSubexpression(graph); + PeepholeOptimize(graph); + ConstantPropagation(graph); + ConstantPooling(graph); - // TODO: add more custom passes + // TODO: add more custom passes - return frozen_model; -} + return frozen_model; + } -Module optimize_for_onnx(const Module& model) { - auto frozen_model = freeze_module(model, {"training"}); - auto graph = frozen_model.get_method("forward").graph(); - OptimizeFrozenGraph(graph, true); + Module optimize_for_onnx(const Module& model) + { + auto frozen_model = freeze_module(model, {"training"}); + auto graph = frozen_model.get_method("forward").graph(); + OptimizeFrozenGraph(graph, true); #if TORCH_VERSION_MINOR >= 9 - FuseFrozenConvAddRelu(graph); - ConvertFrozenOpsToMKLDNN(graph); - FrozenLinearTranspose(graph); + FuseFrozenConvAddRelu(graph); + ConvertFrozenOpsToMKLDNN(graph); + FrozenLinearTranspose(graph); #endif - // TODO: add more custom passes + // TODO: add more custom passes - return frozen_model; -} + return frozen_model; + } -// TODO: add optimizer for other backend/onnx + // TODO: add optimizer for other backend/onnx } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.h index d0d91c627d..fc5a3725d1 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.h @@ -1,10 +1,11 @@ // Copyright (c) OpenMMLab. All rights reserved. 
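[Editor's aside] The two entry points reformatted just above in optimizer.cpp, and declared in this header, are meant to run on a loaded TorchScript module. A hypothetical end-to-end driver; the file names and the include path are assumptions, not part of the patch:

#include <torch/script.h>

#include "optimizer.h"  // assumed include path for the declarations in this header

int main()
{
    auto module = torch::jit::load("model.pt");  // illustrative file name
    module.eval();                               // freezing expects eval mode
    auto frozen = mmdeploy::optimize_for_torchscript(module);
    frozen.save("model_opt.pt");
    return 0;
}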
#include -namespace mmdeploy { -using torch::jit::script::Module; +namespace mmdeploy +{ + using torch::jit::script::Module; -Module optimize_for_torchscript(const Module &model); + Module optimize_for_torchscript(const Module& model); -Module optimize_for_onnx(const Module &model); + Module optimize_for_onnx(const Module& model); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.cpp index c6541e630a..c26db5a34f 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.cpp @@ -4,135 +4,161 @@ #include #include -namespace mmdeploy { -namespace torch_jit { - -using c10::Symbol; -using torch::jit::Block; -using torch::jit::EqualNode; -using torch::jit::HashNode; -using torch::jit::Node; -using torch::jit::Value; - -struct EqualNodeWithParams { - EqualNodeWithParams(std::unordered_map& params) : params_(params) {} - - bool operator()(const Node* lhs, const Node* rhs) const { - auto lhs_inputs = lhs->inputs(); - auto rhs_inputs = rhs->inputs(); - } - - private: - std::unordered_map& params_; -}; - -struct CommonSubexpressionEliminator { - using ParamMapType = std::unordered_map>; - CommonSubexpressionEliminator(std::shared_ptr graph, - std::unordered_map& params) - : graph_(std::move(graph)), params_(params) {} - - bool run(std::function parent_lookup_fn) { - ParamMapType param_map; - return run(graph_->block(), std::move(parent_lookup_fn), param_map); - } - - // The function implements common subexpression elimination. - // Since the nodes are visited in topological order, one pass is enough. - // returns true if CSE made changes to a graph - bool run(Block* block, std::function parent_lookup_fn, ParamMapType& param_map) { - std::unordered_set subexprs; - bool changed = false; - for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) { - auto node = *it; - - // check if inputs come from params(graph input) - auto node_inputs = node->inputs(); - for (auto input : node_inputs) { - if (input->node()->kind() == Symbol::fromQualString("prim::Param")) { - auto debug_name = input->debugName(); - - // check if input in params_ - if (params_.find(debug_name) == params_.end()) continue; - - // check if input is already visited. - if (param_map.find(debug_name) != param_map.end()) continue; - - // check if there is a param has same value with input - auto val = params_[debug_name]; - bool update_map = true; - for (auto kv : param_map) { - auto param_val = kv.second.first; - if (val.device() != param_val.device()) continue; - if (val.dtype() != param_val.dtype()) continue; - if (!val.equal(param_val)) continue; - input->replaceAllUsesWith(kv.second.second); - update_map = false; - break; - } - - // add input to param_map - if (update_map) { - param_map.emplace(debug_name, - std::make_pair(std::move(val), std::move(input))); - } - } - } - - if (!node->blocks().empty()) { - // Traverse sub-blocks. 
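[Editor's aside] The parameter-deduplication loop in this hunk guards at::Tensor comparison with device and dtype checks before calling equal(), since an element-wise comparison only makes sense on like-typed operands. A standalone sketch of that comparison, with illustrative weights:

#include <ATen/ATen.h>

#include <cassert>

// Sketch only: two graph parameters count as the same constant when device,
// dtype, and contents all agree; the pass then rewires uses to one of them.
bool same_param(const at::Tensor& a, const at::Tensor& b)
{
    if (a.device() != b.device()) return false;
    if (a.dtype() != b.dtype()) return false;
    return a.equal(b);  // element-wise equality, including shape
}

int main()
{
    assert(same_param(at::ones({2, 2}), at::ones({2, 2})));    // would be merged
    assert(!same_param(at::ones({2, 2}), at::zeros({2, 2})));  // kept separate
    return 0;
}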
- for (auto block : node->blocks()) { - changed |= run( - block, - [&](Node* n) { - auto existing = subexprs.find(n); - if (existing != subexprs.end()) { - return *existing; +namespace mmdeploy +{ + namespace torch_jit + { + + using c10::Symbol; + using torch::jit::Block; + using torch::jit::EqualNode; + using torch::jit::HashNode; + using torch::jit::Node; + using torch::jit::Value; + + struct EqualNodeWithParams + { + EqualNodeWithParams(std::unordered_map& params) + : params_(params) + { + } + + bool operator()(const Node* lhs, const Node* rhs) const + { + auto lhs_inputs = lhs->inputs(); + auto rhs_inputs = rhs->inputs(); + } + + private: + std::unordered_map& params_; + }; + + struct CommonSubexpressionEliminator + { + using ParamMapType = std::unordered_map>; + CommonSubexpressionEliminator(std::shared_ptr graph, + std::unordered_map& params) + : graph_(std::move(graph)) + , params_(params) + { + } + + bool run(std::function parent_lookup_fn) + { + ParamMapType param_map; + return run(graph_->block(), std::move(parent_lookup_fn), param_map); + } + + // The function implements common subexpression elimination. + // Since the nodes are visited in topological order, one pass is enough. + // returns true if CSE made changes to a graph + bool run(Block* block, std::function parent_lookup_fn, ParamMapType& param_map) + { + std::unordered_set subexprs; + bool changed = false; + for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) + { + auto node = *it; + + // check if inputs come from params(graph input) + auto node_inputs = node->inputs(); + for (auto input : node_inputs) + { + if (input->node()->kind() == Symbol::fromQualString("prim::Param")) + { + auto debug_name = input->debugName(); + + // check if input in params_ + if (params_.find(debug_name) == params_.end()) continue; + + // check if input is already visited. + if (param_map.find(debug_name) != param_map.end()) continue; + + // check if there is a param has same value with input + auto val = params_[debug_name]; + bool update_map = true; + for (auto kv : param_map) + { + auto param_val = kv.second.first; + if (val.device() != param_val.device()) continue; + if (val.dtype() != param_val.dtype()) continue; + if (!val.equal(param_val)) continue; + input->replaceAllUsesWith(kv.second.second); + update_map = false; + break; + } + + // add input to param_map + if (update_map) + { + param_map.emplace(debug_name, + std::make_pair(std::move(val), std::move(input))); + } + } + } + + if (!node->blocks().empty()) + { + // Traverse sub-blocks. + for (auto block : node->blocks()) + { + changed |= run( + block, + [&](Node* n) + { + auto existing = subexprs.find(n); + if (existing != subexprs.end()) + { + return *existing; + } + + return parent_lookup_fn(n); + }, + param_map); + } + + continue; + } + + // Check for CSE opportunities in the parent block. + auto parent_lookup = parent_lookup_fn(node); + auto g_out = node->owningGraph()->outputs(); + if (parent_lookup != nullptr) + { + changed = true; + node->replaceAllUsesWith(parent_lookup); + it.destroyCurrent(); + continue; + } + + // Check whether the same subexpression already exists. + auto subit = subexprs.insert(node); + if (!subit.second) + { + // Subexpression exists, replace the uses of node, and destroy it. + auto existing = *subit.first; + + changed = true; + node->replaceAllUsesWith(existing); + // Destroy the node. 
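[Editor's aside] The subexprs set used in this function is classic hash-consing: nodes are keyed by a structural hash (HashNode/EqualNode), and a failed insertion means an equivalent expression already exists and can absorb the duplicate's uses. A toy model of that step, with strings standing in for Node pointers:

#include <cassert>
#include <string>
#include <unordered_set>

// Toy model only: a failed insert signals a structurally identical
// subexpression, which the real pass handles via replaceAllUsesWith
// followed by destroying the duplicate node.
int main()
{
    std::unordered_set<std::string> subexprs;
    auto first  = subexprs.insert("aten::add(x, y)");
    auto second = subexprs.insert("aten::add(x, y)");  // duplicate structure
    assert(first.second);    // inserted
    assert(!second.second);  // rejected: reuse *second.first instead
    return 0;
}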
+ it.destroyCurrent(); + } } - return parent_lookup_fn(n); - }, - param_map); - } + return changed; + } - continue; - } - - // Check for CSE opportunities in the parent block. - auto parent_lookup = parent_lookup_fn(node); - auto g_out = node->owningGraph()->outputs(); - if (parent_lookup != nullptr) { - changed = true; - node->replaceAllUsesWith(parent_lookup); - it.destroyCurrent(); - continue; - } - - // Check whether the same subexpression already exists. - auto subit = subexprs.insert(node); - if (!subit.second) { - // Subexpression exists, replace the uses of node, and destroy it. - auto existing = *subit.first; - - changed = true; - node->replaceAllUsesWith(existing); - // Destroy the node. - it.destroyCurrent(); - } - } - - return changed; - } - - private: - std::shared_ptr graph_; - std::unordered_map& params_; -}; - -void CommonSubgraphElimination(std::shared_ptr& graph, - std::unordered_map& params) { - CommonSubexpressionEliminator cse(graph, params); - cse.run([](Node*) { return nullptr; }); -} -} // namespace torch_jit + private: + std::shared_ptr graph_; + std::unordered_map& params_; + }; + + void CommonSubgraphElimination(std::shared_ptr& graph, + std::unordered_map& params) + { + CommonSubexpressionEliminator cse(graph, params); + cse.run([](Node*) + { return nullptr; }); + } + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.h index d90b98073e..da108ff733 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.h @@ -3,18 +3,20 @@ #define _COMMON_SUBGRAPH_ELIMINATION_H_ #include -namespace mmdeploy { -namespace torch_jit { -using torch::Tensor; -using torch::jit::Graph; +namespace mmdeploy +{ + namespace torch_jit + { + using torch::Tensor; + using torch::jit::Graph; -// This pass is used eliminate the common subgraph. -// There are two main difference between the one in torch/csrc/jit/pass -// 1. AliasDb is not needed in ONNX model -// 2. params might also participated in the elimination -void CommonSubgraphElimination(std::shared_ptr& graph, - std::unordered_map& params); -} // namespace torch_jit + // This pass is used eliminate the common subgraph. + // There are two main difference between the one in torch/csrc/jit/pass + // 1. AliasDb is not needed in ONNX model + // 2. params might also participated in the elimination + void CommonSubgraphElimination(std::shared_ptr& graph, + std::unordered_map& params); + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.cpp index 73f8965412..db44bdb4c1 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.cpp @@ -9,89 +9,94 @@ #include "utils.h" -namespace mmdeploy { -namespace torch_jit { - -using c10::Symbol; -using torch::jit::IValue; -using torch::jit::Match; -using torch::jit::TensorType; -using torch::jit::TypeKind; -using torch::jit::Value; - -static bool matchClsHead(const Match& match, const std::unordered_map& map) { - // TODO: check if value map in latest pytorch can ease the filter. 
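[Editor's aside] A hypothetical driver for the pass declared in the header above: parse a small graph that computes the same expression twice and let the pass fold it. The IR string and the include paths are assumptions for illustration:

#include <torch/csrc/jit/ir/irparser.h>
#include <torch/script.h>

#include "common_subgraph_elimination.h"  // assumed include path

int main()
{
    auto graph = std::make_shared<torch::jit::Graph>();
    torch::jit::parseIR(R"IR(
graph(%x : Tensor):
  %a : Tensor = aten::relu(%x)
  %b : Tensor = aten::relu(%x)
  return (%a, %b))IR",
                        graph.get());

    std::unordered_map<std::string, at::Tensor> params;  // empty: no weights to dedup
    mmdeploy::torch_jit::CommonSubgraphElimination(graph, params);
    // Afterwards the uses of %b point at %a and the duplicate relu is destroyed.
    return 0;
}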
- - // check cat -1 - { - // check if the shape of second inputs is 1 - auto cat_v1 = match.values_map.at(map.at("cat1")); - if (cat_v1->type()->kind() != TypeKind::TensorType) return false; - auto cat_v1_type = cat_v1->type()->cast(); - auto cat_v1_size = cat_v1_type->sizes().concrete_sizes(); - if (!cat_v1_size.has_value()) return false; - IValue cat_v1_size_value(cat_v1_size.value()); - auto size_list = cat_v1_size_value.toIntList(); - if (size_list.size() != 1 || size_list[0] != 1) return false; - } - - // check unsqueeze - auto cat_v0 = match.values_map.at(map.at("cat0")); - auto unsqueeze_node = cat_v0->node(); - { - if (!is_kind(unsqueeze_node, "onnx::Unsqueeze")) return false; - auto unsqueeze_axes = unsqueeze_node->is(Symbol::attr("axes")); - if (unsqueeze_axes.size() != 1 || unsqueeze_axes[0] != 0) return false; - } - - // check gather - auto gather_node = unsqueeze_node->input()->node(); - auto gather_inputs = gather_node->inputs(); - { - if (!is_kind(gather_node, "onnx::Gather")) return false; - auto gather_axis = gather_node->i(Symbol::attr("axis")); - if (gather_axis != 0) return false; - } - - auto x = match.values_map.at(map.at("x")); - // check shape - auto shape_node = gather_inputs[0]->node(); - { - if (!is_kind(shape_node, "onnx::Shape")) return false; - if (shape_node->input() != x) return false; - } - - // check constant - auto const_node = gather_inputs[1]->node(); - { - if (!is_kind(const_node, "onnx::Constant")) return false; - auto ival = const_node->t(Symbol::attr("value")); - if (ival.dim() != 0) return false; - auto ival_dataptr = ival.data_ptr(); - if (ival_dataptr[0] != 0) return false; - } - - // check if reshape is the output of the graph - auto reshape_pattern = map.at("reshape"); - auto reshape_node = match.values_map.at(reshape_pattern); - auto uses = reshape_node->uses(); - for (auto use : uses) { - auto user = use.user; - if (is_kind(user, "prim::Return")) return false; - } - - return true; -} - -// from: -// x->shape->gather->unsqueeze->concat -// | | -// gap--------------------------reshape -// -// to: -// x->gap->flatten -void FlattenClsHead(std::shared_ptr& graph) { - std::string pattern = R"IR( +namespace mmdeploy +{ + namespace torch_jit + { + + using c10::Symbol; + using torch::jit::IValue; + using torch::jit::Match; + using torch::jit::TensorType; + using torch::jit::TypeKind; + using torch::jit::Value; + + static bool matchClsHead(const Match& match, const std::unordered_map& map) + { + // TODO: check if value map in latest pytorch can ease the filter. 
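+            // The checks below only accept the canonical classifier-head pattern:
+            // `cat1` must be a [1]-shaped tensor, the Unsqueeze/Gather/Shape chain
+            // must trace back to the same input `x`, and the final Reshape must not
+            // be returned directly as a graph output.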
+ + // check cat -1 + { + // check if the shape of second inputs is 1 + auto cat_v1 = match.values_map.at(map.at("cat1")); + if (cat_v1->type()->kind() != TypeKind::TensorType) return false; + auto cat_v1_type = cat_v1->type()->cast(); + auto cat_v1_size = cat_v1_type->sizes().concrete_sizes(); + if (!cat_v1_size.has_value()) return false; + IValue cat_v1_size_value(cat_v1_size.value()); + auto size_list = cat_v1_size_value.toIntList(); + if (size_list.size() != 1 || size_list[0] != 1) return false; + } + + // check unsqueeze + auto cat_v0 = match.values_map.at(map.at("cat0")); + auto unsqueeze_node = cat_v0->node(); + { + if (!is_kind(unsqueeze_node, "onnx::Unsqueeze")) return false; + auto unsqueeze_axes = unsqueeze_node->is(Symbol::attr("axes")); + if (unsqueeze_axes.size() != 1 || unsqueeze_axes[0] != 0) return false; + } + + // check gather + auto gather_node = unsqueeze_node->input()->node(); + auto gather_inputs = gather_node->inputs(); + { + if (!is_kind(gather_node, "onnx::Gather")) return false; + auto gather_axis = gather_node->i(Symbol::attr("axis")); + if (gather_axis != 0) return false; + } + + auto x = match.values_map.at(map.at("x")); + // check shape + auto shape_node = gather_inputs[0]->node(); + { + if (!is_kind(shape_node, "onnx::Shape")) return false; + if (shape_node->input() != x) return false; + } + + // check constant + auto const_node = gather_inputs[1]->node(); + { + if (!is_kind(const_node, "onnx::Constant")) return false; + auto ival = const_node->t(Symbol::attr("value")); + if (ival.dim() != 0) return false; + auto ival_dataptr = ival.data_ptr(); + if (ival_dataptr[0] != 0) return false; + } + + // check if reshape is the output of the graph + auto reshape_pattern = map.at("reshape"); + auto reshape_node = match.values_map.at(reshape_pattern); + auto uses = reshape_node->uses(); + for (auto use : uses) + { + auto user = use.user; + if (is_kind(user, "prim::Return")) return false; + } + + return true; + } + + // from: + // x->shape->gather->unsqueeze->concat + // | | + // gap--------------------------reshape + // + // to: + // x->gap->flatten + void FlattenClsHead(std::shared_ptr& graph) + { + std::string pattern = R"IR( graph(%x, %cat0, %cat1): %gap = onnx::GlobalAveragePool(%x) %cat = onnx::Concat[axis=0](%cat0, %cat1) @@ -99,21 +104,22 @@ void FlattenClsHead(std::shared_ptr& graph) { return (%reshape) )IR"; - std::string replacement = R"IR( + std::string replacement = R"IR( graph(%x, %cat0, %cat1): %gap = onnx::GlobalAveragePool(%x) %flatten = onnx::Flatten(%gap) return (%flatten) )IR"; - torch::jit::SubgraphRewriter subgraph_rewriter; - subgraph_rewriter.RegisterRewritePattern(pattern, replacement); - subgraph_rewriter.runOnGraph(graph, matchClsHead); + torch::jit::SubgraphRewriter subgraph_rewriter; + subgraph_rewriter.RegisterRewritePattern(pattern, replacement); + subgraph_rewriter.runOnGraph(graph, matchClsHead); - torch::jit::EliminateDeadCode( - graph->block(), true, - torch::jit::DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS); -} + torch::jit::EliminateDeadCode( + graph->block(), + true, + torch::jit::DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS); + } -} // namespace torch_jit + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.h index b66b700d1c..64d8ea3352 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.h +++ 
b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.h @@ -3,12 +3,14 @@ #define _FLATTEN_CLS_HEAD_H_ #include -namespace mmdeploy { -namespace torch_jit { -using torch::jit::Graph; +namespace mmdeploy +{ + namespace torch_jit + { + using torch::jit::Graph; -void FlattenClsHead(std::shared_ptr& graph); -} // namespace torch_jit + void FlattenClsHead(std::shared_ptr& graph); + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.cpp index 8dc5847753..2798abaa8c 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.cpp @@ -6,131 +6,149 @@ #include "common_subgraph_elimination.h" #include "torch/csrc/jit/ir/irparser.h" -namespace mmdeploy { -namespace torch_jit { - -using c10::Symbol; -using torch::jit::Block; -using torch::jit::IValue; -using torch::jit::Node; - -bool RemoveBoolCast(Node* node) { - auto bottom_node = node->input()->node(); - if (bottom_node->kind() != Symbol::onnx("Greater") && - bottom_node->kind() != Symbol::onnx("Less")) { - return false; - } - node->output()->replaceAllUsesWith(bottom_node->output()); - return true; -} - -bool FuseSelectAssign(Node* node, std::unordered_map& params, - std::unordered_map& vmap, SubgraphMatcher& matcher) { - auto values_map = matcher.values_map(); - - auto cmp1 = values_map[vmap["cmp_1"]]->node(); - auto cmp2 = values_map[vmap["cmp_2"]]->node(); - if (cmp1 != cmp2) { - // cmp_1 == cmp_2, cmp in (Great, Less) - if (cmp1->kind() != cmp2->kind()) return false; - if (!(cmp1->kind() == Symbol::onnx("Greater") || cmp1->kind() == Symbol::onnx("Less"))) - return false; - - // check threshold - Node* cmps[] = {cmp1, cmp2}; - float thres = 0.0f; - Node* x = nullptr; - for (int i = 0; i < 2; ++i) { - auto cmp = cmps[i]; - auto threshold = cmp->inputs()[1]->node(); - if (threshold->kind() != Symbol::onnx("Constant")) return false; - auto thres_val = threshold->t(Symbol::attr("value")); - if (i == 0) { - thres = thres_val.data_ptr()[0]; - x = cmp->inputs()[0]->node(); - } else { - float tmp_val = thres_val.data_ptr()[0]; - if (fabs(thres - tmp_val) > 1e-10) { - return false; +namespace mmdeploy +{ + namespace torch_jit + { + + using c10::Symbol; + using torch::jit::Block; + using torch::jit::IValue; + using torch::jit::Node; + + bool RemoveBoolCast(Node* node) + { + auto bottom_node = node->input()->node(); + if (bottom_node->kind() != Symbol::onnx("Greater") && + bottom_node->kind() != Symbol::onnx("Less")) + { + return false; + } + node->output()->replaceAllUsesWith(bottom_node->output()); + return true; } - if (x != cmp->inputs()[0]->node()) { - return false; + + bool FuseSelectAssign(Node* node, std::unordered_map& params, std::unordered_map& vmap, SubgraphMatcher& matcher) + { + auto values_map = matcher.values_map(); + + auto cmp1 = values_map[vmap["cmp_1"]]->node(); + auto cmp2 = values_map[vmap["cmp_2"]]->node(); + if (cmp1 != cmp2) + { + // cmp_1 == cmp_2, cmp in (Great, Less) + if (cmp1->kind() != cmp2->kind()) return false; + if (!(cmp1->kind() == Symbol::onnx("Greater") || cmp1->kind() == Symbol::onnx("Less"))) + return false; + + // check threshold + Node* cmps[] = {cmp1, cmp2}; + float thres = 0.0f; + Node* x = nullptr; + for (int i = 0; i < 2; ++i) + { + auto cmp = cmps[i]; + auto threshold = 
cmp->inputs()[1]->node(); + if (threshold->kind() != Symbol::onnx("Constant")) return false; + auto thres_val = threshold->t(Symbol::attr("value")); + if (i == 0) + { + thres = thres_val.data_ptr()[0]; + x = cmp->inputs()[0]->node(); + } + else + { + float tmp_val = thres_val.data_ptr()[0]; + if (fabs(thres - tmp_val) > 1e-10) + { + return false; + } + if (x != cmp->inputs()[0]->node()) + { + return false; + } + } + } + } + + { + // check shape of reshape + Node* shape = values_map[vmap["reshape_1_shape"]]->node(); + auto shape_val = shape->t(Symbol::attr("value")); + if (shape_val.dim() != 1) return false; + if (shape_val.data_ptr()[0] != -1) return false; + } + + { + // check transpose + Node* trans[] = {values_map[vmap["trans_1"]]->node(), values_map[vmap["trans_2"]]->node()}; + for (auto tran : trans) + { + auto tran_perm = tran->is(Symbol::attr("perm")); + if (tran_perm.size() != 2) return false; + if (tran_perm[0] != 1 || tran_perm[1] != 0) return false; + } + } + + { + // check gather indice + Node* gather_inds = values_map[vmap["gather_inds_2"]]->node(); + auto inds_val = gather_inds->t(Symbol::attr("value")); + if (inds_val.dim() != 0) return false; + if (inds_val.data_ptr()[0] != 0) return false; + } + + { + // check slice start + Node* slice = values_map[vmap["slice_2"]]->node(); + auto start_name = slice->inputs()[1]->debugName(); + auto start_val = params[start_name]; + if (start_val.dim() != 1) return false; + if (start_val.data_ptr()[0] != 0) return false; + } + + // create new node + auto graph = node->owningGraph(); + auto z = values_map[vmap["z"]]; + auto y = values_map[vmap["y"]]; + auto where_node = graph->create(Symbol::onnx("Where"), {cmp1->output(), z, y}); + where_node->insertBefore(node); + where_node->output()->copyMetadata(node->output()); + node->output()->replaceAllUsesWith(where_node->output()); + return true; + } + + void FuseSelectAssign(Block* block, std::unordered_map& params, std::unordered_map& vmap, SubgraphMatcher& matcher) + { + auto graph = block->owningGraph(); + auto it = block->nodes().begin(); + while (it != block->nodes().end()) + { + auto node = *it; + ++it; + for (auto block : node->blocks()) + { + FuseSelectAssign(block, params, vmap, matcher); + } + + if (node->kind() == Symbol::onnx("Cast") && node->i(Symbol::attr("to")) == 9) + { + RemoveBoolCast(node); + } + else if (matcher.matchesSubgraphFromAnchorNode(node)) + { + FuseSelectAssign(node, params, vmap, matcher); + } + } } - } - } - } - - { - // check shape of reshape - Node* shape = values_map[vmap["reshape_1_shape"]]->node(); - auto shape_val = shape->t(Symbol::attr("value")); - if (shape_val.dim() != 1) return false; - if (shape_val.data_ptr()[0] != -1) return false; - } - - { - // check transpose - Node* trans[] = {values_map[vmap["trans_1"]]->node(), values_map[vmap["trans_2"]]->node()}; - for (auto tran : trans) { - auto tran_perm = tran->is(Symbol::attr("perm")); - if (tran_perm.size() != 2) return false; - if (tran_perm[0] != 1 || tran_perm[1] != 0) return false; - } - } - - { - // check gather indice - Node* gather_inds = values_map[vmap["gather_inds_2"]]->node(); - auto inds_val = gather_inds->t(Symbol::attr("value")); - if (inds_val.dim() != 0) return false; - if (inds_val.data_ptr()[0] != 0) return false; - } - - { - // check slice start - Node* slice = values_map[vmap["slice_2"]]->node(); - auto start_name = slice->inputs()[1]->debugName(); - auto start_val = params[start_name]; - if (start_val.dim() != 1) return false; - if (start_val.data_ptr()[0] != 0) return false; - } - 
- // create new node - auto graph = node->owningGraph(); - auto z = values_map[vmap["z"]]; - auto y = values_map[vmap["y"]]; - auto where_node = graph->create(Symbol::onnx("Where"), {cmp1->output(), z, y}); - where_node->insertBefore(node); - where_node->output()->copyMetadata(node->output()); - node->output()->replaceAllUsesWith(where_node->output()); - return true; -} - -void FuseSelectAssign(Block* block, std::unordered_map& params, - std::unordered_map& vmap, SubgraphMatcher& matcher) { - auto graph = block->owningGraph(); - auto it = block->nodes().begin(); - while (it != block->nodes().end()) { - auto node = *it; - ++it; - for (auto block : node->blocks()) { - FuseSelectAssign(block, params, vmap, matcher); - } - - if (node->kind() == Symbol::onnx("Cast") && node->i(Symbol::attr("to")) == 9) { - RemoveBoolCast(node); - } else if (matcher.matchesSubgraphFromAnchorNode(node)) { - FuseSelectAssign(node, params, vmap, matcher); - } - } -} - -void FuseSelectAssign(std::shared_ptr& graph, - std::unordered_map& params) { - // cse before search - CommonSubgraphElimination(graph, params); - - std::string pattern_str = R"IR( + + void FuseSelectAssign(std::shared_ptr& graph, + std::unordered_map& params) + { + // cse before search + CommonSubgraphElimination(graph, params); + + std::string pattern_str = R"IR( graph(%y, %z, %cmp_1, %cmp_2, %start, %axes, %shape_2): %nz_1 = onnx::NonZero(%cmp_1) %trans_1 = onnx::Transpose(%nz_1) @@ -149,15 +167,16 @@ void FuseSelectAssign(std::shared_ptr& graph, return (%scatter_2) )IR"; - Graph pattern; - std::unordered_map vmap; - torch::jit::parseIR(pattern_str, &pattern, vmap); - - SubgraphMatcher matcher(pattern, MatchAttribute::NO_MATCH); - FuseSelectAssign(graph->block(), params, vmap, matcher); - torch::jit::EliminateDeadCode( - graph->block(), true, - torch::jit::DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS); -} -} // namespace torch_jit + Graph pattern; + std::unordered_map vmap; + torch::jit::parseIR(pattern_str, &pattern, vmap); + + SubgraphMatcher matcher(pattern, MatchAttribute::NO_MATCH); + FuseSelectAssign(graph->block(), params, vmap, matcher); + torch::jit::EliminateDeadCode( + graph->block(), + true, + torch::jit::DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS); + } + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.h index afa0dc56d6..0e80ec1d67 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.h @@ -3,15 +3,17 @@ #define _FUSE_SELECT_ASSIGN_H_ #include -namespace mmdeploy { -namespace torch_jit { -using torch::Tensor; -using torch::jit::Graph; +namespace mmdeploy +{ + namespace torch_jit + { + using torch::Tensor; + using torch::jit::Graph; -// this pass is used to fuse y[x>thres] = z[x>thres] -void FuseSelectAssign(std::shared_ptr& graph, - std::unordered_map& params); -} // namespace torch_jit + // this pass is used to fuse y[x>thres] = z[x>thres] + void FuseSelectAssign(std::shared_ptr& graph, + std::unordered_map& params); + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.cpp index 3da4933b15..dea6909f8b 100644 --- 
a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.cpp @@ -5,111 +5,131 @@ #include "utils.h" -namespace mmdeploy { -namespace torch_jit { - -using c10::Symbol; -using torch::jit::Block; -using torch::jit::IValue; -using torch::jit::Node; -using torch::jit::TensorType; -using torch::jit::Value; - -void MergeShapeConcate(Node* node) { - auto inputs = node->inputs(); - - std::vector gather_value; - Value* shape_from = nullptr; - - std::vector node_to_remove{node}; - - // check pattern shape->gather->unsqueeze->concate - for (auto input : inputs) { - auto unsqueeze_node = input->node(); - if (!is_kind(unsqueeze_node, "onnx::Unsqueeze") || unsqueeze_node->output()->uses().size() != 1) - return; - - if (unsqueeze_node->hasAttribute(Symbol::attr("axes"))) { - auto axes = unsqueeze_node->is(Symbol::attr("axes")); - if (axes.size() != 1 && axes[0] != 0) return; - } - - auto gather_node = unsqueeze_node->input(0)->node(); - if (!is_kind(gather_node, "onnx::Gather") || gather_node->i(Symbol::attr("axis")) != 0 || - gather_node->output()->uses().size() != 1) - return; - - auto gather_inputs = gather_node->inputs(); - auto gather_data = gather_inputs[0]; - auto gather_indices = gather_inputs[1]; - auto shape_node = gather_data->node(); - if (!is_kind(shape_node, "onnx::Shape") || shape_node->output()->uses().size() != 1) return; - - auto current_shape_from = shape_node->input(); - if (!shape_from) { - shape_from = current_shape_from; - } else { - if (shape_from != current_shape_from) return; - } - - auto constant_node = gather_indices->node(); - if (!is_kind(constant_node, "onnx::Constant")) return; - - auto gather_indices_val = constant_node->t(Symbol::attr("value")); - int64_t* data_ptr = gather_indices_val.data_ptr(); - if (gather_indices_val.dim() == 0) { - gather_value.push_back(data_ptr[0]); - } else { - int element_size = gather_indices_val.element_size(); - for (int j = 0; j < element_size; ++j) { - gather_value.push_back(data_ptr[j]); - } - } - - node_to_remove.insert(node_to_remove.end(), {unsqueeze_node, gather_node, shape_node}); - } - - // create constant value - auto graph = node->owningGraph(); - auto const_node = graph->create(Symbol::onnx("Constant")); - const_node->t_(Symbol::attr("value"), at::tensor(gather_value)); - auto first_node = node->owningGraph()->block()->nodes().front(); - if (const_node != first_node) const_node->insertBefore(first_node); - - // recreate shape node - auto shape_node = graph->create(Symbol::onnx("Shape"), {shape_from}); - shape_node->insertBefore(node); - - // create gather node - auto gather_node = - graph->create(Symbol::onnx("Gather"), {shape_node->output(), const_node->output()}); - - // insert into graph - gather_node->insertAfter(node); - node->output()->replaceAllUsesWith(gather_node->output()); - - for (auto n : node_to_remove) { - n->destroy(); - } -} - -void MergeShapeConcate(Block* block) { - auto graph = block->owningGraph(); - auto it = block->nodes().begin(); - while (it != block->nodes().end()) { - auto node = *it; - ++it; - for (auto block : node->blocks()) { - MergeShapeConcate(block); - } - - if (is_kind(node, "onnx::Concat")) { - MergeShapeConcate(node); - } - } -} - -void MergeShapeConcate(const std::shared_ptr& graph) { MergeShapeConcate(graph->block()); } - -} // namespace torch_jit +namespace mmdeploy +{ + namespace torch_jit + { + + using c10::Symbol; + using torch::jit::Block; + using torch::jit::IValue; + using 
torch::jit::Node; + using torch::jit::TensorType; + using torch::jit::Value; + + void MergeShapeConcate(Node* node) + { + auto inputs = node->inputs(); + + std::vector gather_value; + Value* shape_from = nullptr; + + std::vector node_to_remove{node}; + + // check pattern shape->gather->unsqueeze->concate + for (auto input : inputs) + { + auto unsqueeze_node = input->node(); + if (!is_kind(unsqueeze_node, "onnx::Unsqueeze") || unsqueeze_node->output()->uses().size() != 1) + return; + + if (unsqueeze_node->hasAttribute(Symbol::attr("axes"))) + { + auto axes = unsqueeze_node->is(Symbol::attr("axes")); + if (axes.size() != 1 && axes[0] != 0) return; + } + + auto gather_node = unsqueeze_node->input(0)->node(); + if (!is_kind(gather_node, "onnx::Gather") || gather_node->i(Symbol::attr("axis")) != 0 || + gather_node->output()->uses().size() != 1) + return; + + auto gather_inputs = gather_node->inputs(); + auto gather_data = gather_inputs[0]; + auto gather_indices = gather_inputs[1]; + auto shape_node = gather_data->node(); + if (!is_kind(shape_node, "onnx::Shape") || shape_node->output()->uses().size() != 1) return; + + auto current_shape_from = shape_node->input(); + if (!shape_from) + { + shape_from = current_shape_from; + } + else + { + if (shape_from != current_shape_from) return; + } + + auto constant_node = gather_indices->node(); + if (!is_kind(constant_node, "onnx::Constant")) return; + + auto gather_indices_val = constant_node->t(Symbol::attr("value")); + int64_t* data_ptr = gather_indices_val.data_ptr(); + if (gather_indices_val.dim() == 0) + { + gather_value.push_back(data_ptr[0]); + } + else + { + int element_size = gather_indices_val.element_size(); + for (int j = 0; j < element_size; ++j) + { + gather_value.push_back(data_ptr[j]); + } + } + + node_to_remove.insert(node_to_remove.end(), {unsqueeze_node, gather_node, shape_node}); + } + + // create constant value + auto graph = node->owningGraph(); + auto const_node = graph->create(Symbol::onnx("Constant")); + const_node->t_(Symbol::attr("value"), at::tensor(gather_value)); + auto first_node = node->owningGraph()->block()->nodes().front(); + if (const_node != first_node) const_node->insertBefore(first_node); + + // recreate shape node + auto shape_node = graph->create(Symbol::onnx("Shape"), {shape_from}); + shape_node->insertBefore(node); + + // create gather node + auto gather_node = + graph->create(Symbol::onnx("Gather"), {shape_node->output(), const_node->output()}); + + // insert into graph + gather_node->insertAfter(node); + node->output()->replaceAllUsesWith(gather_node->output()); + + for (auto n : node_to_remove) + { + n->destroy(); + } + } + + void MergeShapeConcate(Block* block) + { + auto graph = block->owningGraph(); + auto it = block->nodes().begin(); + while (it != block->nodes().end()) + { + auto node = *it; + ++it; + for (auto block : node->blocks()) + { + MergeShapeConcate(block); + } + + if (is_kind(node, "onnx::Concat")) + { + MergeShapeConcate(node); + } + } + } + + void MergeShapeConcate(const std::shared_ptr& graph) + { + MergeShapeConcate(graph->block()); + } + + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.h index 8656da63c2..13a67f0f47 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.h @@ -3,12 +3,14 @@ 
#define _MERGE_SHAPE_CONCATE_H_ #include -namespace mmdeploy { -namespace torch_jit { -using torch::jit::Graph; +namespace mmdeploy +{ + namespace torch_jit + { + using torch::jit::Graph; -void MergeShapeConcate(const std::shared_ptr& graph); -} // namespace torch_jit + void MergeShapeConcate(const std::shared_ptr& graph); + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.cpp index f0ef5a5230..0b687c5083 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.cpp @@ -7,75 +7,91 @@ #include "utils.h" -namespace mmdeploy { -namespace torch_jit { - -using c10::Symbol; -using torch::jit::Block; -using torch::jit::IValue; -using torch::jit::Node; -using torch::jit::TensorType; -using torch::jit::Value; - -void RemoveReshapeChain(Node* node) { - // reshape->reshape => reshape - auto output = node->output(); - if (!(output->hasUses())) { - return; - } - auto uses = output->uses(); - - for (auto use : uses) { - if (!is_kind(use.user, "onnx::Reshape") || use.offset != 0) { - return; - } - } - - auto input = node->inputs()[0]; - output->replaceAllUsesWith(input); - - node->destroy(); -} - -void RemoveRedundantCast(Node* node) { - // Cast(type n)->Cast(type n) => Cast(type n) - - auto to_type = node->i(Symbol::attr("to")); - auto input = node->input(); - - auto input_node = input->node(); - if (is_kind(input_node, "onnx::Cast") && input_node->i(Symbol::attr("to")) == to_type) { - auto output = node->output(); - - output->replaceAllUsesWith(input); - node->destroy(); - } -} - -void ONNXPeephole(Block* block) { - auto graph = block->owningGraph(); - auto it = block->nodes().begin(); - while (it != block->nodes().end()) { - auto node = *it; - ++it; - for (auto block : node->blocks()) { - ONNXPeephole(block); - } - - if (is_kind(node, "onnx::Reshape")) { - RemoveReshapeChain(node); - } else if (is_kind(node, "onnx::Cast")) { - RemoveRedundantCast(node); - } - } -} - -void ONNXPeephole(const std::shared_ptr& graph) { - ONNXPeephole(graph->block()); - torch::jit::EliminateDeadCode( - graph->block(), true, - torch::jit::DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS); -} - -} // namespace torch_jit +namespace mmdeploy +{ + namespace torch_jit + { + + using c10::Symbol; + using torch::jit::Block; + using torch::jit::IValue; + using torch::jit::Node; + using torch::jit::TensorType; + using torch::jit::Value; + + void RemoveReshapeChain(Node* node) + { + // reshape->reshape => reshape + auto output = node->output(); + if (!(output->hasUses())) + { + return; + } + auto uses = output->uses(); + + for (auto use : uses) + { + if (!is_kind(use.user, "onnx::Reshape") || use.offset != 0) + { + return; + } + } + + auto input = node->inputs()[0]; + output->replaceAllUsesWith(input); + + node->destroy(); + } + + void RemoveRedundantCast(Node* node) + { + // Cast(type n)->Cast(type n) => Cast(type n) + + auto to_type = node->i(Symbol::attr("to")); + auto input = node->input(); + + auto input_node = input->node(); + if (is_kind(input_node, "onnx::Cast") && input_node->i(Symbol::attr("to")) == to_type) + { + auto output = node->output(); + + output->replaceAllUsesWith(input); + node->destroy(); + } + } + + void ONNXPeephole(Block* block) + { + auto graph = block->owningGraph(); + auto it = block->nodes().begin(); + while (it 
!= block->nodes().end()) + { + auto node = *it; + ++it; + for (auto block : node->blocks()) + { + ONNXPeephole(block); + } + + if (is_kind(node, "onnx::Reshape")) + { + RemoveReshapeChain(node); + } + else if (is_kind(node, "onnx::Cast")) + { + RemoveRedundantCast(node); + } + } + } + + void ONNXPeephole(const std::shared_ptr& graph) + { + ONNXPeephole(graph->block()); + torch::jit::EliminateDeadCode( + graph->block(), + true, + torch::jit::DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS); + } + + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.h index f388da1bfa..21b7be15d1 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.h @@ -3,13 +3,15 @@ #define _ONNX_PEEPHOLE_H_ #include -namespace mmdeploy { -namespace torch_jit { -using torch::jit::Graph; +namespace mmdeploy +{ + namespace torch_jit + { + using torch::jit::Graph; -void ONNXPeephole(const std::shared_ptr& graph); + void ONNXPeephole(const std::shared_ptr& graph); -} // namespace torch_jit + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/utils.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/utils.h index 1c92cd15a1..147e5b1349 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/utils.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/utils.h @@ -3,18 +3,24 @@ #include -namespace mmdeploy { -namespace torch_jit { -using c10::Symbol; -using torch::jit::Node; +namespace mmdeploy +{ + namespace torch_jit + { + using c10::Symbol; + using torch::jit::Node; -inline bool is_kind(const Node* node, const Symbol& symbol) { return node->kind() == symbol; } + inline bool is_kind(const Node* node, const Symbol& symbol) + { + return node->kind() == symbol; + } -inline bool is_kind(const Node* node, const char* symbol_name) { - return is_kind(node, Symbol::fromQualString(symbol_name)); -} + inline bool is_kind(const Node* node, const char* symbol_name) + { + return is_kind(node, Symbol::fromQualString(symbol_name)); + } -} // namespace torch_jit + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/codebase/common.h b/csrc/mmdeploy/codebase/common.h index 391f177590..f5d01c3bbe 100644 --- a/csrc/mmdeploy/codebase/common.h +++ b/csrc/mmdeploy/codebase/common.h @@ -9,69 +9,87 @@ #include "mmdeploy/core/utils/formatter.h" #include "mmdeploy/experimental/module_adapter.h" -namespace mmdeploy { +namespace mmdeploy +{ -using namespace framework; + using namespace framework; -class Context { - public: - explicit Context(const Value& config) { - MMDEPLOY_DEBUG("config: {}", config); - device_ = config["context"]["device"].get(); - stream_ = config["context"]["stream"].get(); - } + class Context + { + public: + explicit Context(const Value& config) + { + MMDEPLOY_DEBUG("config: {}", config); + device_ = config["context"]["device"].get(); + stream_ = config["context"]["stream"].get(); + } - Device& device() { return device_; } - Stream& stream() { return stream_; } + Device& device() + { + return device_; + } + Stream& stream() + { + return stream_; + } - protected: - Device device_; - Stream stream_; -}; + protected: + Device device_; + Stream stream_; + }; -template -class CodebaseCreator : public 
Creator { - public: - std::string_view name() const noexcept override { return Tag::name; } - std::unique_ptr Create(const Value& cfg) override { - constexpr auto key{"component"}; - if (!cfg.contains(key)) { - MMDEPLOY_ERROR("no key '{}' in config {}", key, cfg); - throw_exception(eInvalidArgument); - } - if (!cfg[key].is_string()) { - MMDEPLOY_ERROR("key '{}' is not a string", key); - throw_exception(eInvalidArgument); - } - auto postprocess_type = cfg[key].get(); - auto creator = gRegistry().Get(postprocess_type); - if (creator == nullptr) { - MMDEPLOY_ERROR("Could not found entry '{}' in {}. Available components: {}", postprocess_type, - Tag::name, gRegistry().List()); - throw_exception(eEntryNotFound); - } - return creator->Create(cfg); - } -}; + template + class CodebaseCreator : public Creator + { + public: + std::string_view name() const noexcept override + { + return Tag::name; + } + std::unique_ptr Create(const Value& cfg) override + { + constexpr auto key{"component"}; + if (!cfg.contains(key)) + { + MMDEPLOY_ERROR("no key '{}' in config {}", key, cfg); + throw_exception(eInvalidArgument); + } + if (!cfg[key].is_string()) + { + MMDEPLOY_ERROR("key '{}' is not a string", key); + throw_exception(eInvalidArgument); + } + auto postprocess_type = cfg[key].get(); + auto creator = gRegistry().Get(postprocess_type); + if (creator == nullptr) + { + MMDEPLOY_ERROR("Could not found entry '{}' in {}. Available components: {}", postprocess_type, Tag::name, gRegistry().List()); + throw_exception(eEntryNotFound); + } + return creator->Create(cfg); + } + }; -#define MMDEPLOY_DECLARE_CODEBASE(codebase_type, codebase_name) \ - class codebase_type : public Context { \ - public: \ - static constexpr const auto name = #codebase_name; \ - using type = std::unique_ptr; \ - explicit codebase_type(const Value& config) : Context(config) {} \ - }; \ - MMDEPLOY_DECLARE_REGISTRY(codebase_type, std::unique_ptr(const Value& config)); +#define MMDEPLOY_DECLARE_CODEBASE(codebase_type, codebase_name) \ + class codebase_type : public Context \ + { \ + public: \ + static constexpr const auto name = #codebase_name; \ + using type = std::unique_ptr; \ + explicit codebase_type(const Value& config) \ + : Context(config) \ + { \ + } \ + }; \ + MMDEPLOY_DECLARE_REGISTRY(codebase_type, std::unique_ptr(const Value& config)); -#define MMDEPLOY_REGISTER_CODEBASE(codebase) \ - using codebase##_##Creator = CodebaseCreator; \ - MMDEPLOY_REGISTER_CREATOR(Module, codebase##_##Creator) \ - MMDEPLOY_DEFINE_REGISTRY(codebase) +#define MMDEPLOY_REGISTER_CODEBASE(codebase) \ + using codebase##_##Creator = CodebaseCreator; \ + MMDEPLOY_REGISTER_CREATOR(Module, codebase##_##Creator) \ + MMDEPLOY_DEFINE_REGISTRY(codebase) -#define MMDEPLOY_REGISTER_CODEBASE_COMPONENT(codebase, component_type) \ - MMDEPLOY_REGISTER_FACTORY_FUNC(codebase, (component_type, 0), [](const Value& config) { \ - return CreateTask(component_type(config)); \ - }) +#define MMDEPLOY_REGISTER_CODEBASE_COMPONENT(codebase, component_type) \ + MMDEPLOY_REGISTER_FACTORY_FUNC(codebase, (component_type, 0), [](const Value& config) { return CreateTask(component_type(config)); }) } // namespace mmdeploy diff --git a/csrc/mmdeploy/codebase/mmaction/base_head.cpp b/csrc/mmdeploy/codebase/mmaction/base_head.cpp index 931c9663eb..2e541fd660 100644 --- a/csrc/mmdeploy/codebase/mmaction/base_head.cpp +++ b/csrc/mmdeploy/codebase/mmaction/base_head.cpp @@ -7,66 +7,75 @@ #include "mmdeploy/core/tensor.h" #include "mmdeploy/core/utils/device_utils.h" -namespace mmdeploy::mmaction { 
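+// BaseHead picks the top-k classification labels from the model's score tensor;
+// SlowFastHead and TSNHead below are aliases that reuse this implementation.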
+namespace mmdeploy::mmaction +{ -class BaseHead : public MMAction { - public: - explicit BaseHead(const Value& cfg) : MMAction(cfg) { - if (cfg.contains("params")) { - topk_ = cfg["params"].value("topk", 1); - if (topk_ <= 0) { - MMDEPLOY_ERROR("'topk' should be greater than 0, but got '{}'", topk_); - throw_exception(eInvalidArgument); - } - } - } + class BaseHead : public MMAction + { + public: + explicit BaseHead(const Value& cfg) + : MMAction(cfg) + { + if (cfg.contains("params")) + { + topk_ = cfg["params"].value("topk", 1); + if (topk_ <= 0) + { + MMDEPLOY_ERROR("'topk' should be greater than 0, but got '{}'", topk_); + throw_exception(eInvalidArgument); + } + } + } - Result operator()(const Value& infer_res) { - MMDEPLOY_DEBUG("infer_res: {}", infer_res); - auto output = infer_res["output"].get(); + Result operator()(const Value& infer_res) + { + MMDEPLOY_DEBUG("infer_res: {}", infer_res); + auto output = infer_res["output"].get(); - if (!(output.shape().size() >= 2 && output.data_type() == DataType::kFLOAT)) { - MMDEPLOY_ERROR("unsupported `output` tensor, shape: {}, dtype: {}", output.shape(), - (int)output.data_type()); - return Status(eNotSupported); - } + if (!(output.shape().size() >= 2 && output.data_type() == DataType::kFLOAT)) + { + MMDEPLOY_ERROR("unsupported `output` tensor, shape: {}, dtype: {}", output.shape(), (int)output.data_type()); + return Status(eNotSupported); + } - auto class_num = (int)output.shape(1); + auto class_num = (int)output.shape(1); - OUTCOME_TRY(auto _scores, MakeAvailableOnDevice(output, kHost, stream())); - OUTCOME_TRY(stream().Wait()); + OUTCOME_TRY(auto _scores, MakeAvailableOnDevice(output, kHost, stream())); + OUTCOME_TRY(stream().Wait()); - return GetLabels(_scores, class_num); - } + return GetLabels(_scores, class_num); + } - private: - Value GetLabels(const Tensor& scores, int class_num) const { - auto scores_data = scores.data(); - Labels output; - output.reserve(topk_); - std::vector idx(class_num); - iota(begin(idx), end(idx), 0); - partial_sort(begin(idx), begin(idx) + topk_, end(idx), - [&](int i, int j) { return scores_data[i] > scores_data[j]; }); - for (int i = 0; i < topk_; ++i) { - auto label = Label{idx[i], scores_data[idx[i]]}; - MMDEPLOY_DEBUG("label_id: {}, score: {}", label.label_id, label.score); - output.push_back(label); - } - return to_value(std::move(output)); - } + private: + Value GetLabels(const Tensor& scores, int class_num) const + { + auto scores_data = scores.data(); + Labels output; + output.reserve(topk_); + std::vector idx(class_num); + iota(begin(idx), end(idx), 0); + partial_sort(begin(idx), begin(idx) + topk_, end(idx), [&](int i, int j) + { return scores_data[i] > scores_data[j]; }); + for (int i = 0; i < topk_; ++i) + { + auto label = Label{idx[i], scores_data[idx[i]]}; + MMDEPLOY_DEBUG("label_id: {}, score: {}", label.label_id, label.score); + output.push_back(label); + } + return to_value(std::move(output)); + } - private: - static constexpr const auto kHost = Device{0}; - int topk_{1}; -}; + private: + static constexpr const auto kHost = Device{0}; + int topk_{1}; + }; -MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMAction, BaseHead); + MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMAction, BaseHead); -using SlowFastHead = BaseHead; -MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMAction, SlowFastHead); + using SlowFastHead = BaseHead; + MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMAction, SlowFastHead); -using TSNHead = BaseHead; -MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMAction, TSNHead); + using TSNHead = BaseHead; + 
MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMAction, TSNHead); } // namespace mmdeploy::mmaction diff --git a/csrc/mmdeploy/codebase/mmaction/format_shape.cpp b/csrc/mmdeploy/codebase/mmaction/format_shape.cpp index 7d8c6ac5c6..ff65fe184d 100644 --- a/csrc/mmdeploy/codebase/mmaction/format_shape.cpp +++ b/csrc/mmdeploy/codebase/mmaction/format_shape.cpp @@ -7,122 +7,141 @@ using namespace std; -namespace mmdeploy::mmaction { - -FormatShape::FormatShape(const Value& args) { - input_format_ = args.value("input_format", std::string("")); - if (input_format_ != "NCHW" && input_format_ != "NCTHW") { - MMDEPLOY_ERROR("'input_format' should be 'NCHW' or 'NCTHW'"); - throw_exception(eInvalidArgument); - } - permute_ = ::mmdeploy::operation::Managed<::mmdeploy::operation::Permute>::Create(); -} - -Result FormatShape::MergeInputs(const std::vector& images, Tensor& inputs) { - auto N = static_cast(images.size()); - auto H = images[0].shape(1); - auto W = images[0].shape(2); - auto C = images[0].shape(3); - auto& device = operation::gContext().device(); - auto& stream = operation::gContext().stream(); - - TensorDesc desc = {device, DataType::kFLOAT, {N, H, W, C}}; - inputs = Tensor(desc); - auto offset = 0UL; - auto n_item = H * W * C; - auto copy_size = n_item * sizeof(float); - for (int i = 0; i < N; i++) { - auto src_buffer = images[i].buffer(); - auto dst_buffer = inputs.buffer(); - OUTCOME_TRY(stream.Copy(src_buffer, dst_buffer, copy_size, 0, offset)); - offset += copy_size; - } - return success(); -} - -Result FormatShape::Format(const std::vector& images, Tensor& output, int clip_len, - int num_clips) { - Tensor inputs; - OUTCOME_TRY(MergeInputs(images, inputs)); - - // Tensor dst; - if (input_format_ == "NCHW") { - OUTCOME_TRY(FormatNCHW(inputs, clip_len, num_clips, output)); - } - if (input_format_ == "NCTHW") { - OUTCOME_TRY(FormatNCTHW(inputs, clip_len, num_clips, output)); - } - - TensorShape expand_dim = output.shape(); - expand_dim.insert(expand_dim.begin(), 1); - output.Reshape(expand_dim); - - return success(); -} - -Result FormatShape::FormatNCHW(Tensor& src, int clip_len, int num_clips, Tensor& dst) { - const vector axes = {0, 3, 1, 2}; - OUTCOME_TRY(permute_.Apply(src, dst, axes)); - return success(); -} - -Result FormatShape::FormatNCTHW(Tensor& src, int clip_len, int num_clips, Tensor& dst) { - auto N = src.shape(0); - auto H = src.shape(1); - auto W = src.shape(2); - auto C = src.shape(3); - int L = clip_len; - if (N % L != 0) { - return Status(eInvalidArgument); - } - int M = N / L; - src.Reshape({M, L, H, W, C}); - const vector axes = {0, 4, 1, 2, 3}; - OUTCOME_TRY(permute_.Apply(src, dst, axes)); - return success(); -} - -Result FormatShape::Apply(Value& data) { - MMDEPLOY_DEBUG("input: {}", data); - - if (!data.is_array()) { - MMDEPLOY_ERROR("input of format shape should be array"); - return Status(eInvalidArgument); - } - if (!(data[0].contains("imgs") || data[0].contains("img"))) { - MMDEPLOY_ERROR("input should contains imgs or img"); - return Status(eInvalidArgument); - } - - int n_image = data.size(); - int clip_len = data[0]["clip_len"].get(); - int num_clips = data[0]["num_clips"].get(); - std::vector images; - - if (data[0].contains("imgs")) { - int n_crop = data[0]["imgs"].size(); - int total = n_image * n_crop; - images.reserve(total); - for (int i = 0; i < n_crop; i++) { - for (int j = 0; j < n_image; j++) { - images.push_back(data[j]["imgs"][i].get()); - } +namespace mmdeploy::mmaction +{ + + FormatShape::FormatShape(const Value& args) + { + input_format_ = 
args.value("input_format", std::string("")); + if (input_format_ != "NCHW" && input_format_ != "NCTHW") + { + MMDEPLOY_ERROR("'input_format' should be 'NCHW' or 'NCTHW'"); + throw_exception(eInvalidArgument); + } + permute_ = ::mmdeploy::operation::Managed<::mmdeploy::operation::Permute>::Create(); } - } else if (data[0].contains("img")) { - images.reserve(n_image); - for (int i = 0; i < n_image; i++) { - images.push_back(data[i]["img"].get()); + + Result FormatShape::MergeInputs(const std::vector& images, Tensor& inputs) + { + auto N = static_cast(images.size()); + auto H = images[0].shape(1); + auto W = images[0].shape(2); + auto C = images[0].shape(3); + auto& device = operation::gContext().device(); + auto& stream = operation::gContext().stream(); + + TensorDesc desc = {device, DataType::kFLOAT, {N, H, W, C}}; + inputs = Tensor(desc); + auto offset = 0UL; + auto n_item = H * W * C; + auto copy_size = n_item * sizeof(float); + for (int i = 0; i < N; i++) + { + auto src_buffer = images[i].buffer(); + auto dst_buffer = inputs.buffer(); + OUTCOME_TRY(stream.Copy(src_buffer, dst_buffer, copy_size, 0, offset)); + offset += copy_size; + } + return success(); + } + + Result FormatShape::Format(const std::vector& images, Tensor& output, int clip_len, int num_clips) + { + Tensor inputs; + OUTCOME_TRY(MergeInputs(images, inputs)); + + // Tensor dst; + if (input_format_ == "NCHW") + { + OUTCOME_TRY(FormatNCHW(inputs, clip_len, num_clips, output)); + } + if (input_format_ == "NCTHW") + { + OUTCOME_TRY(FormatNCTHW(inputs, clip_len, num_clips, output)); + } + + TensorShape expand_dim = output.shape(); + expand_dim.insert(expand_dim.begin(), 1); + output.Reshape(expand_dim); + + return success(); } - } - Tensor dst; - data = Value{}; - OUTCOME_TRY(Format(images, dst, clip_len, num_clips)); - data["img"] = std::move(dst); + Result FormatShape::FormatNCHW(Tensor& src, int clip_len, int num_clips, Tensor& dst) + { + const vector axes = {0, 3, 1, 2}; + OUTCOME_TRY(permute_.Apply(src, dst, axes)); + return success(); + } - return success(); -} + Result FormatShape::FormatNCTHW(Tensor& src, int clip_len, int num_clips, Tensor& dst) + { + auto N = src.shape(0); + auto H = src.shape(1); + auto W = src.shape(2); + auto C = src.shape(3); + int L = clip_len; + if (N % L != 0) + { + return Status(eInvalidArgument); + } + int M = N / L; + src.Reshape({M, L, H, W, C}); + const vector axes = {0, 4, 1, 2, 3}; + OUTCOME_TRY(permute_.Apply(src, dst, axes)); + return success(); + } + + Result FormatShape::Apply(Value& data) + { + MMDEPLOY_DEBUG("input: {}", data); + + if (!data.is_array()) + { + MMDEPLOY_ERROR("input of format shape should be array"); + return Status(eInvalidArgument); + } + if (!(data[0].contains("imgs") || data[0].contains("img"))) + { + MMDEPLOY_ERROR("input should contains imgs or img"); + return Status(eInvalidArgument); + } + + int n_image = data.size(); + int clip_len = data[0]["clip_len"].get(); + int num_clips = data[0]["num_clips"].get(); + std::vector images; + + if (data[0].contains("imgs")) + { + int n_crop = data[0]["imgs"].size(); + int total = n_image * n_crop; + images.reserve(total); + for (int i = 0; i < n_crop; i++) + { + for (int j = 0; j < n_image; j++) + { + images.push_back(data[j]["imgs"][i].get()); + } + } + } + else if (data[0].contains("img")) + { + images.reserve(n_image); + for (int i = 0; i < n_image; i++) + { + images.push_back(data[i]["img"].get()); + } + } + + Tensor dst; + data = Value{}; + OUTCOME_TRY(Format(images, dst, clip_len, num_clips)); + data["img"] = 
std::move(dst);
+
+            return success();
+        }
-MMDEPLOY_REGISTER_TRANSFORM(FormatShape);
+    MMDEPLOY_REGISTER_TRANSFORM(FormatShape);
 
 }  // namespace mmdeploy::mmaction
diff --git a/csrc/mmdeploy/codebase/mmaction/format_shape.h b/csrc/mmdeploy/codebase/mmaction/format_shape.h
index 97e4f99356..7ea0326c84 100644
--- a/csrc/mmdeploy/codebase/mmaction/format_shape.h
+++ b/csrc/mmdeploy/codebase/mmaction/format_shape.h
@@ -12,27 +12,28 @@
 #include "mmdeploy/operation/vision.h"
 #include "mmdeploy/preprocess/transform/transform.h"
 
-namespace mmdeploy::mmaction {
+namespace mmdeploy::mmaction
+{
 
-class FormatShape : public Transform {
- public:
-  explicit FormatShape(const Value& args);
+    class FormatShape : public Transform
+    {
+      public:
+        explicit FormatShape(const Value& args);
 
-  Result<void> Apply(Value& data) override;
+        Result<void> Apply(Value& data) override;
 
-  Result<void> Format(const std::vector<Tensor>& images, Tensor& output, int clip_len,
-                      int num_clips);
+        Result<void> Format(const std::vector<Tensor>& images, Tensor& output, int clip_len, int num_clips);
 
-  Result<void> FormatNCHW(Tensor& src, int clip_len, int num_clips, Tensor& dst);
+        Result<void> FormatNCHW(Tensor& src, int clip_len, int num_clips, Tensor& dst);
 
-  Result<void> FormatNCTHW(Tensor& src, int clip_len, int num_clips, Tensor& dst);
+        Result<void> FormatNCTHW(Tensor& src, int clip_len, int num_clips, Tensor& dst);
 
-  Result<void> MergeInputs(const std::vector<Tensor>& images, Tensor& inputs);
+        Result<void> MergeInputs(const std::vector<Tensor>& images, Tensor& inputs);
 
- private:
-  std::string input_format_;
-  operation::Managed<operation::Permute> permute_;
-};
+      private:
+        std::string input_format_;
+        operation::Managed<operation::Permute> permute_;
+    };
 
 }  // namespace mmdeploy::mmaction
diff --git a/csrc/mmdeploy/codebase/mmaction/mmaction.cpp b/csrc/mmdeploy/codebase/mmaction/mmaction.cpp
index dc590a1800..7de226ecd1 100644
--- a/csrc/mmdeploy/codebase/mmaction/mmaction.cpp
+++ b/csrc/mmdeploy/codebase/mmaction/mmaction.cpp
@@ -2,8 +2,9 @@
 
 #include "mmdeploy/codebase/mmaction/mmaction.h"
 
-namespace mmdeploy::mmaction {
+namespace mmdeploy::mmaction
+{
 
-MMDEPLOY_REGISTER_CODEBASE(MMAction);
+    MMDEPLOY_REGISTER_CODEBASE(MMAction);
 
 }  // namespace mmdeploy::mmaction
diff --git a/csrc/mmdeploy/codebase/mmaction/mmaction.h b/csrc/mmdeploy/codebase/mmaction/mmaction.h
index ef097e6f20..a3add86894 100644
--- a/csrc/mmdeploy/codebase/mmaction/mmaction.h
+++ b/csrc/mmdeploy/codebase/mmaction/mmaction.h
@@ -8,17 +8,19 @@
 #include "mmdeploy/core/module.h"
 #include "mmdeploy/core/serialization.h"
 
-namespace mmdeploy::mmaction {
+namespace mmdeploy::mmaction
+{
 
-struct Label {
-  int label_id;
-  float score;
-  MMDEPLOY_ARCHIVE_MEMBERS(label_id, score);
-};
+    struct Label
+    {
+        int label_id;
+        float score;
+        MMDEPLOY_ARCHIVE_MEMBERS(label_id, score);
+    };
 
-using Labels = std::vector