diff --git a/.gitignore b/.gitignore index 1fc741e69..c0ab7fc54 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ protobuf_lib/ sysroot/ android_config.txt model_src/ +.vs/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b6156999..5488ba2d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,17 +1,18 @@ if(CMAKE_TOOLCHAIN_FILE) -set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to") -# get absolute path, but get_filename_component ABSOLUTE only refer with source dir, so find_file here :( get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME) +message(STATUS,"CMAKE_TOOLCHAIN_FILE_NAME = ${CMAKE_TOOLCHAIN_FILE_NAME}, ${CMAKE_TOOLCHAIN_FILE}, ${CMAKE_SOURCE_DIR}") find_file(CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH) message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}") +else() + message(FATAL_ERROR "cmake file only used for Android build") endif() if(NOT DEFINED CMAKE_INSTALL_PREFIX) set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/install" CACHE PATH "Installation Directory") +message(STATUS,"CMAKE_CURRENT_SRC_DIR = ${CMAKE_CURRENT_SOURCE_DIR}") endif() message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}") - cmake_minimum_required(VERSION 3.6) set(CMAKE_BUILD_TYPE debug) @@ -19,15 +20,61 @@ set(CMAKE_BUILD_TYPE debug) # set(CMAKE_BUILD_TYPE release) +#get the NDK_ROOT from android.toolchains.cmake + +set(PARSED_ANDROID_NDK_REGEX "(.+)/build/cmake/android.toolchain.cmake") + +string(REGEX REPLACE "${PARSED_ANDROID_NDK_REGEX}" "\\1" PARSED_ANDROID_NDK ${CMAKE_TOOLCHAIN_FILE}) + +file(READ "${PARSED_ANDROID_NDK}/source.properties" TE_NDK_SOURCE_PROPERTIES) + +set(TE_NDK_REVISION_REGEX + "^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.([0-9]+)\\.([0-9]+)?") +if(NOT TE_NDK_SOURCE_PROPERTIES MATCHES "${TE_NDK_REVISION_REGEX}") + message(FATAL_ERROR "Failed to parse 
Android NDK revision: ${ANDROID_NDK}/source.properties.\n${TE_NDK_SOURCE_PROPERTIES}") +endif() + +set(PARSED_NDK_MAJOR "${CMAKE_MATCH_1}") +set(PARSED_NDK_MINOR "${CMAKE_MATCH_2}") + + +#ndk less than 15, CMAKE 3.6.3 works, while higher version may failed +if(PARSED_NDK_MAJOR LESS 15) +if(CMAKE_VERSION VERSION_GREATER 3.6.3) + message(FATAL_ERROR "please use cmake at most VERSION 3.6.3 for ndk " ${PARSED_NDK_MAJOR} "." ${PARSED_NDK_MINOR}) +endif() +endif() + +#real project logic starts from here + project(tengine_android) option(CONFIG_ARCH_ARM64 "build arm64 version" OFF) +option(CONFIG_ARCH_ARM32 "build arm32 version" OFF) option(CONFIG_ARCH_BLAS "build blas version" OFF) +option(CONFIG_ARCH_ARM8_2 "build float16 for arm8.2" OFF) option(CONFIG_ACL_GPU "build acl gpu version" OFF) option(CONFIG_CAFFE_SERIALIZER "caffe serializer" ON) option(CONFIG_ONNX_SERIALIZER "onnx serializer" OFF) +option(CONFIG_MXNET_SERIALIZER "mxnet serializer" OFF) option(CONFIG_TF_SERIALIZER "tensorflow serializer" OFF) +option(CONFIG_TFLITE_SERIALIZER "tflite serializer" OFF) option(CONFIG_TENGINE_SERIALIZER "tengine serializer" ON) +option(CONFIG_KERNEL_FP32 "KERNEL FP32" ON) +if(ANDROID_NDK_MAJOR AND CONFIG_ARCH_ARM8_2) + option(CONFIG_KERNEL_FP16 "KERNEL FP16" ON) +endif() +option(CONFIG_KERNEL_INT8 "KERNEL INT8" ON) +option(CONFIG_KERNEL_UINT8 "KERNEL UINT8" ON) +option(CONFIG_AUTH_DEVICE "AUTH DEVICE" ON) + + +#in face, this is related with run-time env, since API LEVEL 22 binary can run on API LEVEL 23 platform +if(ANDROID_PLATFORM_LEVEL LESS 23) + add_definitions(-DNO_CXA_DEMANGLE) +endif() + +set(CONFIG_VERSION_POSTFIX github) #message("list dir ${CMAKE_CURRENT_LIST_DIR}/.git") if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/.git) @@ -43,31 +90,64 @@ set(GIT_COMMIT_ID -DGIT_COMMIT_ID="0x${stripped_commit_id}") message("GIT COMMIT ID: " 0x${stripped_commit_id}) if (CONFIG_ARCH_ARM64) - add_definitions(-DCONFIG_ARCH_ARM64=1) + add_definitions(-DCONFIG_ARCH_ARM64=1) +endif() + 
+if(CONFIG_ARCH_ARM32) + add_definitions(-DCONFIG_ARCH_ARM32=1) endif() if(CONFIG_ARCH_BLAS) - add_definitions(-DCONFIG_ARCH_BLAS=1) + add_definitions(-DCONFIG_ARCH_BLAS=1) +endif() + +if(CONFIG_ARCH_ARM8_2) + add_definitions(-DCONFIG_ARCH_ARM8_2=1) + add_definitions(-mcpu=cortex-a55) endif() if(CONFIG_ACL_GPU) - add_definitions(-DCONFIG_ACL_GPU=1) + add_definitions(-DCONFIG_ACL_GPU=1) endif() if(CONFIG_CAFFE_SERIALIZER) - add_definitions(-DCONFIG_CAFFE_SERIALIZER=1) + add_definitions(-DCONFIG_CAFFE_SERIALIZER=1) endif() if(CONFIG_ONNX_SERIALIZER) - add_definitions(-DCONFIG_ONNX_SERIALIZER=1) + add_definitions(-DCONFIG_ONNX_SERIALIZER=1) +endif() + +if(CONFIG_MXNET_SERIALIZER) + add_definitions(-DCONFIG_MXNET_SERIALIZER=1) endif() if(CONFIG_TF_SERIALIZER) - add_definitions(-DCONFIG_TF_SERIALIZER=1) + add_definitions(-DCONFIG_TF_SERIALIZER=1) endif() if(CONFIG_TENGINE_SERIALIZER) - add_definitions(-DCONFIG_TENGINE_SERIALIZER=1) + add_definitions(-DCONFIG_TENGINE_SERIALIZER=1) +endif() + +if(CONFIG_KERNEL_FP32) + add_definitions(-DCONFIG_KERNEL_FP32=1) +endif() + +if(CONFIG_KERNEL_FP16) + add_definitions(-DCONFIG_KERNEL_FP16=1) +endif() + +if(CONFIG_KERNEL_INT8) + add_definitions(-DCONFIG_KERNEL_INT8=1) +endif() + +if(CONFIG_KERNEL_UINT8) + add_definitions(-DCONFIG_KERNEL_UINT8=1) +endif() + +if (CONFIG_VERSION_POSTFIX) + add_definitions(-DCONFIG_VERSION_POSTFIX="${CONFIG_VERSION_POSTFIX}") endif() add_definitions(${GIT_COMMIT_ID}) @@ -83,7 +163,8 @@ add_definitions(-Wno-overloaded-virtual) set(CMAKE_CXX_STANDARD 11) set(CXX_STANDARD_REQUIRED ON) -#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-overloaded-virtual") + +set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Bsymbolic -Bsymbolic-functions") include_directories(include) @@ -111,10 +192,14 @@ endif() ENDFOREACH() - +ADD_LIBRARY(hclcpu SHARED ${TOPERATOR_LIB_SRCS}) ADD_LIBRARY(tengine SHARED ${TENGINE_LIB_SRCS} ${TENGINE_SIGN_SRCS}) -ADD_DEPENDENCIES(tengine KERNEL_ASM_TARGET) +#executor 
+ADD_DEPENDENCIES(hclcpu KERNEL_ASM_TARGET) +TARGET_LINK_LIBRARIES(hclcpu tengine) + +#target_compile_definitions(operator,"--allow-shlib-undefined") if(PROTOBUF_DIR) if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) @@ -146,11 +231,11 @@ if(CONFIG_ARCH_BLAS) if( BLAS_DIR) if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) TARGET_LINK_LIBRARIES(tengine ${BLAS_DIR}/arm32/lib/libopenblas.so) - endif() + endif() if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")) - TARGET_LINK_LIBRARIES(tengine ${BLAS_DIR}/arm64/lib/libopenblas.so) - endif() - else() + TARGET_LINK_LIBRARIES(tengine ${BLAS_DIR}/arm64/lib/libopenblas.so) + endif() + else() message(FATAL_ERROR "need to set the blas path") endif() endif() @@ -172,8 +257,8 @@ if(CONFIG_ACL_GPU) endif() install (TARGETS tengine DESTINATION lib) +install (TARGETS hclcpu DESTINATION lib) install (FILES ${CMAKE_CURRENT_SOURCE_DIR}/core/include/tengine_c_api.h DESTINATION include) install (FILES ${CMAKE_CURRENT_SOURCE_DIR}/core/include/cpu_device.h DESTINATION include) -install (FILES ${CMAKE_CURRENT_SOURCE_DIR}/core/include/tengine_test_api.h DESTINATION include) install (FILES ${CMAKE_CURRENT_SOURCE_DIR}/core/include/tengine_c_compat.h DESTINATION include) diff --git a/LICENSE b/LICENSE index d64569567..d83ed7001 100644 --- a/LICENSE +++ b/LICENSE @@ -187,7 +187,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2019 OPEN AI LAB Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/Makefile b/Makefile index f00c89bb4..d67bcb6c3 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,6 @@ -### cross compile for ARM64 -#CROSS_COMPILE=aarch64-linux-gnu- -### cross compile for ARM32 -#CROSS_COMPILE=arm-linux-gnueabihf- +MAKEFILE_CONFIG=$(shell pwd)/makefile.config +include $(MAKEFILE_CONFIG) + SYSROOT:=$(shell pwd)/sysroot/ubuntu_rootfs ifeq ($(CROSS_COMPILE),aarch64-linux-gnu-) @@ -17,11 +16,20 @@ ifeq ($(CROSS_COMPILE),arm-linux-gnueabihf-) export PKG_CONFIG_PATH endif -CC=$(CROSS_COMPILE)gcc -std=gnu99 $(SYSROOT_FLAGS) -CXX=$(CROSS_COMPILE)g++ -std=c++11 $(SYSROOT_FLAGS) -LD=$(CROSS_COMPILE)g++ $(SYSROOT_FLAGS) $(SYSROOT_LDFLAGS) +ifeq ($(EMBEDDED_CROSS_ROOT),) + CC=$(CROSS_COMPILE)gcc -std=gnu99 $(SYSROOT_FLAGS) + CXX=$(CROSS_COMPILE)g++ -std=c++11 $(SYSROOT_FLAGS) + LD=$(CROSS_COMPILE)g++ $(SYSROOT_FLAGS) $(SYSROOT_LDFLAGS) +else + CC=$(CROSS_COMPILE)gcc -std=gnu99 + CXX=$(CROSS_COMPILE)g++ -std=c++11 + LD=$(CROSS_COMPILE)g++ + PKG_CONFIG_PATH:=$(EMBEDDED_CROSS_ROOT)/usr/lib/pkgconfig +endif + AR=$(CROSS_COMPILE)ar + BUILT_IN_LD=$(CROSS_COMPILE)ld GIT_COMMIT_ID=$(shell git rev-parse HEAD) @@ -31,10 +39,9 @@ COMMON_CFLAGS+=-Wno-ignored-attributes -Werror -g export CC CXX CFLAGS BUILT_IN_LD LD LDFLAGS CXXFLAGS COMMON_CFLAGS export GIT_COMMIT_ID -MAKEFILE_CONFIG=$(shell pwd)/makefile.config + MAKEBUILD=$(shell pwd)/scripts/makefile.build -include $(MAKEFILE_CONFIG) BUILD_DIR?=$(shell pwd)/build INSTALL_DIR?=$(shell pwd)/install @@ -45,9 +52,10 @@ export INSTALL_DIR MAKEBUILD TOP_DIR MAKEFILE_CONFIG LIB_SUB_DIRS=core operator executor serializer driver model_src - LIB_SO=$(BUILD_DIR)/libtengine.so LIB_A=$(BUILD_DIR)/libtengine.a +LIB_HCL_SO=$(BUILD_DIR)/libhclcpu.so +export LIB_HCL_SO LIB_OBJS=$(addprefix $(BUILD_DIR)/, $(foreach f,$(LIB_SUB_DIRS),$(f)/built-in.o)) @@ -62,8 +70,14 @@ APP_SUB_DIRS+=tests ifeq ($(CONFIG_ARCH_ARM32),y) COMMON_CFLAGS+=-march=armv7-a -mfpu=neon -mfp16-format=ieee -mfpu=neon-fp16 + export CONFIG_ARCH_ARM32 +endif 
+ +ifeq ($(CONFIG_ARCH_ARM64),y) + export CONFIG_ARCH_ARM64 endif + ifeq ($(CONFIG_FLOAT16),y) COMMON_CFLAGS+=-DCONFIG_FLOAT16 endif @@ -73,22 +87,41 @@ ifeq ($(CONFIG_LEGACY_API),y) endif +HCL_SUB_DIRS+=hclarm +LIB_HCL_OBJS=$(BUILD_DIR)/hclarm/arm-builtin.o + +ifeq ($(CONFIG_KERNEL_FP32),y) + COMMON_CFLAGS+=-DCONFIG_KERNEL_FP32 +endif + +ifeq ($(CONFIG_KERNEL_FP16),y) + COMMON_CFLAGS+=-DCONFIG_KERNEL_FP16 +endif + +ifeq ($(CONFIG_KERNEL_INT8),y) + COMMON_CFLAGS+=-DCONFIG_KERNEL_INT8 +endif + +ifeq ($(CONFIG_KERNEL_UINT8),y) + COMMON_CFLAGS+=-DCONFIG_KERNEL_UINT8 +endif + SUB_DIRS=$(LIB_SUB_DIRS) $(APP_SUB_DIRS) -default: $(LIB_SO) $(APP_SUB_DIRS) +default: $(LIB_SO) $(LIB_HCL_SO) $(APP_SUB_DIRS) build : default -clean: $(SUB_DIRS) +clean: $(SUB_DIRS) $(HCL_SUB_DIRS) -install: $(APP_SUB_DIRS) - @mkdir -p $(INSTALL_DIR)/include $(INSTALL_DIR)/lib +install: $(APP_SUB_DIRS) $(HCL_SUB_DIRS) + @mkdir -p $(INSTALL_DIR)/include $(INSTALL_DIR)/lib $(INSTALL_DIR)/tool cp -f core/include/tengine_c_api.h $(INSTALL_DIR)/include cp -f core/include/tengine_c_compat.h $(INSTALL_DIR)/include cp -f core/include/cpu_device.h $(INSTALL_DIR)/include - cp -f core/include/tengine_test_api.h $(INSTALL_DIR)/include cp -f $(BUILD_DIR)/libtengine.so $(INSTALL_DIR)/lib + cp -f $(BUILD_DIR)/tools/bin/convert_model_to_tm $(INSTALL_DIR)/tool ifeq ($(CONFIG_ACL_GPU),y) @@ -112,8 +145,15 @@ endif -$(LIB_SO): $(REAL_LIB_OBJS) - $(LD) -o $@ -shared -Wl,-Bsymbolic -Wl,-Bsymbolic-functions $(wildcard $(LIB_OBJS)) $(LIB_LDFLAGS) +$(LIB_SO): $(REAL_LIB_OBJS) $(LIB_HCL_SO) + $(LD) -o $@ -shared -Wl,-Bsymbolic -Wl,-Bsymbolic-functions $(wildcard $(LIB_OBJS)) $(LIB_LDFLAGS) $ -L$(BUILD_DIR) -Wl,-rpath,\$$ORIGIN -Wl,-rpath-link=\$$ORIGIN + +ifneq ( $(LIB_HCL_SO),) + $(LIB_HCL_SO): $(HCL_SUB_DIRS); +else + $(LIB_HCL_SO): + +endif static: static_lib static_example @@ -125,10 +165,23 @@ static_lib: static_example: static_lib $(LD) -o $(BUILD_DIR)/test_tm $(BUILD_DIR)/tests/bin/test_tm.o $(LIBS) -ltengine \ - 
-ldl -lpthread -static -L$(BUILD_DIR) -lprotobuf -lblas -lpthread + -ldl -lpthread -static -L$(BUILD_DIR) @echo ; echo static example: $(BUILD_DIR)/test_tm created -LIB_LDFLAGS+=-lpthread -lprotobuf -ldl +LIB_LDFLAGS+=-lpthread -ldl + +ifeq ($(CONFIG_CAFFE_SERIALIZER),y) + PROTOBUF_NEEDED=y +endif + +ifeq ($(CONFIG_TF_SERIALIZER),y) + PROTOBUF_NEEDED=y +endif + +ifeq ($(PROTOBUF_NEEDED),y) + PROTOBUF_LIB=$(shell export PKG_CONFIG_PATH=${PKG_CONFIG_PATH} && pkg-config --libs protobuf) + LIB_LDFLAGS+=$(PROTOBUF_LIB) +endif ifeq ($(CONFIG_ARCH_BLAS),y) LIB_LDFLAGS+=-lopenblas @@ -141,7 +194,7 @@ endif $(LIB_SUB_DIRS): @$(MAKE) -C $@ -f $(MAKEBUILD) BUILD_DIR=$(BUILD_DIR)/$@ $(MAKECMDGOALS) -$(APP_SUB_DIRS): +$(APP_SUB_DIRS) $(HCL_SUB_DIRS): @$(MAKE) -C $@ BUILD_DIR=$(BUILD_DIR)/$@ $(MAKECMDGOALS) @@ -153,4 +206,4 @@ distclean: find . -name $(BUILD_DIR) | xargs rm -rf find . -name $(INSTALL_DIR) | xargs rm -rf -.PHONY: clean install $(SUB_DIRS) build +.PHONY: clean install $(SUB_DIRS) build $(HCL_SUB_DIRS) diff --git a/README.md b/README.md index 859d7543e..0656d87d0 100644 --- a/README.md +++ b/README.md @@ -63,10 +63,45 @@ Tengine can be extended to support new serialization format, by building new ser ## Release History + +## version 1.3.2 - 2019/04/19 + +**tengine model 2.0** + +**New apis** + +get_graph_node_number() +get_graph_node_by_idx() + +**New features** + +Separate CPU operator as a independent so: hclcpu.so + +Add Reference Operator + +Update Testcase & Update permute for mxnet + +Update lstm grun mxnet serializer + +Support MXNET serializer in CMakelist.txt + +Support TFLITE serializer in CMakelist.txt + +Support eltwise in TFLITE serializer + +**More operator support** + +RNN operator definition and blas implementation + +LSTM operator definition and blas implementation + +GRU operator definition and blas implementation + ## version 1.0.0 - 2018/12/31 **tengine API 2.0** + New API set for NN inference Simplify graph create process: just create_graph() 
instead of load_model() and create_runtime_graph() diff --git a/android_build_armv7.sh b/android_build_armv7.sh index 6c8fac1c5..c28e75af1 100755 --- a/android_build_armv7.sh +++ b/android_build_armv7.sh @@ -26,7 +26,8 @@ done<../android_config.txt cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="armeabi-v7a" \ -DANDROID_ARM_NEON=ON \ - -DCONFIG_ARCH_BLAS=ON \ + -DANDROID_ALLOW_UNDEFINED_SYMBOLS=TRUE\ + -DCONFIG_ARCH_ARM32=ON \ -DANDROID_PLATFORM=android-21 \ -DANDROID_STL=c++_shared \ -DPROTOBUF_DIR=$PROTOBUF_PATH \ diff --git a/android_build_armv8.sh b/android_build_armv8.sh index 15f5d776c..0f2206938 100755 --- a/android_build_armv8.sh +++ b/android_build_armv8.sh @@ -31,4 +31,5 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DPROTOBUF_DIR=$PROTOBUF_PATH \ -DBLAS_DIR=$BLAS_PATH \ -DACL_ROOT=$ACL_ROOT \ + -DANDROID_ALLOW_UNDEFINED_SYMBOLS=TRUE\ .. diff --git a/android_pack.sh b/android_pack.sh index 862eb781c..46457c83b 100755 --- a/android_pack.sh +++ b/android_pack.sh @@ -41,5 +41,6 @@ else fi cp build/libtengine.so ./android_pack +cp build/libhclcpu.so ./android_pack diff --git a/cmake/executor.cmake b/cmake/executor.cmake old mode 100644 new mode 100755 index 64df99d37..2ab1b9b75 --- a/cmake/executor.cmake +++ b/cmake/executor.cmake @@ -1,57 +1,67 @@ include_directories(executor/include executor/operator/include) FILE(GLOB_RECURSE COMMON_LIB_CPP_SRCS executor/engine/*.cpp executor/lib/*.cpp executor/plugin/*.cpp) -FILE(GLOB COMMON_CPP_SRCS executor/operator/common/*.cpp executor/operator/common/fused/*.cpp) +FILE(GLOB COMMON_CPP_SRCS executor/operator/init.cpp executor/operator/common/*.cpp executor/operator/common/fused/*.cpp) +FILE(GLOB_RECURSE REF_CPP_SRCS executor/operator/ref/*.cpp) + if(CONFIG_ARCH_BLAS) FILE(GLOB COMMON_BLAS_SRCS executor/operator/common/blas/*.cpp) list(APPEND COMMON_CPP_SRCS ${COMMON_BLAS_SRCS}) + if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL 
"armv7-a")) + include_directories(${BLAS_DIR}/arm32/include) + endif() + if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")) + include_directories(${BLAS_DIR}/arm64/include) + endif() endif() -list(APPEND TENGINE_LIB_SRCS ${COMMON_LIB_CPP_SRCS}) -list(APPEND TENGINE_LIB_SRCS ${COMMON_CPP_SRCS}) +if(CONFIG_AUTH_DEVICE) +include_directories(hclarm/auth) +FILE(GLOB_RECURSE HCL_AUTH_SRCS hclarm/*.cpp hclarm/*.c) +list(APPEND TOPERATOR_LIB_SRCS ${HCL_AUTH_SRCS}) -include_directories(driver/cpu) +# For different settings, please change the COMPILE_FLAGS +# Please refers to hclarm/auth/auth.config +FOREACH (file ${HCL_AUTH_SRCS}) +SET_SOURCE_FILES_PROPERTIES ( ${file} PROPERTIES COMPILE_FLAGS "-DCONFIG_INTERN_TRIAL -DCONFIG_TIME_LIMIT=7200") -#add openblas include -if(CONFIG_ARCH_BLAS) - if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) - include_directories(${BLAS_DIR}/arm32/include) - endif() - if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")) - include_directories(${BLAS_DIR}/arm64/include) - endif() +ENDFOREACH() + endif() +list(APPEND TENGINE_LIB_SRCS ${COMMON_LIB_CPP_SRCS}) +list(APPEND TOPERATOR_LIB_SRCS ${COMMON_CPP_SRCS}) +list(APPEND TOPERATOR_LIB_SRCS ${REF_CPP_SRCS}) -# Now, handle the .S file +include_directories(driver/cpu) if(CONFIG_ARCH_ARM64) - FILE(GLOB_RECURSE ARCH64_LIB_CPP_SRCS executor/operator/arm64/*.cpp) + FILE(GLOB_RECURSE ARCH_LIB_CPP_SRCS executor/operator/arm64/*.cpp) + FILE(GLOB_RECURSE TARGET_ARCH_FILES executor/operator/arm64/*.S) include_directories(executor/operator/arm64/include) - - FOREACH(file ${ARCH64_LIB_CPP_SRCS}) - set(ACL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/executor/operator/arm64/conv/conv_2d_acl") - STRING(REGEX MATCH ${ACL_PREFIX} skip_file2 ${file}) - - if( NOT skip_file2) - list(APPEND ARCH_LIB_CPP_SRCS ${file}) - endif() - - endforeach() endif() +if(CONFIG_ARCH_ARM32) + FILE(GLOB_RECURSE ARCH_LIB_CPP_SRCS executor/operator/arm32/*.cpp) + FILE(GLOB_RECURSE TARGET_ARCH_FILES 
executor/operator/arm32/*.S) + include_directories(executor/operator/arm32/include) +endif() -list(APPEND TENGINE_LIB_SRCS ${ARCH_LIB_CPP_SRCS}) +if(CONFIG_ARCH_ARM8_2) + FILE(GLOB_RECURSE ARCH_LIB_CPP_SRCS_8_2 executor/operator/arm8_2/*.cpp) + FILE(GLOB_RECURSE TARGET_ARCH_FILES_8_2 executor/operator/arm8_2/*.S) + include_directories(executor/operator/arm8_2/include) + list(APPEND ARCH_LIB_CPP_SRCS ${ARCH_LIB_CPP_SRCS_8_2}) + list(APPEND TARGET_ARCH_FILES ${TARGET_ARCH_FILES_8_2}) +endif() -# Now, handle the .S file +FOREACH(file ${ARCH_LIB_CPP_SRCS}) + set_property(SOURCE ${file} PROPERTY COMPILE_FLAGS "-fvisibility=hidden") +ENDFOREACH() -if( CONFIG_ARCH_ARM64) +list(APPEND TOPERATOR_LIB_SRCS ${ARCH_LIB_CPP_SRCS}) -set(src_path executor/operator/arm64) -FILE(GLOB TARGET_ARCH_FILES ${src_path}/*.S ${src_path}/fc/*.S - ${src_path}/conv/*.S - ${src_path}/fused/*.S) -endif() +# Now, handle the .S file FOREACH( file ${TARGET_ARCH_FILES}) string(REPLACE "\.S" "\.s" PREPROCESS_FILE0 ${file}) @@ -68,7 +78,7 @@ ADD_CUSTOM_COMMAND( #message(${file} --> ${PREPROCESS_FILE}) -list(APPEND TENGINE_LIB_SRCS ${PREPROCESS_FILE}) +list(APPEND TOPERATOR_LIB_SRCS ${PREPROCESS_FILE}) list(APPEND ASM_FILES ${PREPROCESS_FILE}) SET_SOURCE_FILES_PROPERTIES ( ${PREPROCESS_FILE} PROPERTIES GENERATED 1) @@ -79,4 +89,3 @@ ENDFOREACH() ADD_CUSTOM_TARGET(KERNEL_ASM_TARGET DEPENDS ${ASM_FILES}) - diff --git a/cmake/serializer.cmake b/cmake/serializer.cmake index ac747df2b..e77f3a197 100644 --- a/cmake/serializer.cmake +++ b/cmake/serializer.cmake @@ -91,13 +91,28 @@ if(CONFIG_TF_SERIALIZER) endif() if(CONFIG_TENGINE_SERIALIZER) + include_directories(serializer/include/tengine) + include_directories(serializer/include/tengine/v1) + include_directories(serializer/include/tengine/v2) FILE(GLOB_RECURSE tengine_serializer_cpp_src "serializer/tengine/*.cpp") FILE(GLOB_RECURSE tengine_serializer_c_src "serializer/tengine/*.c") list(APPEND TENGINE_LIB_SRCS ${tengine_serializer_cpp_src} 
${tengine_serializer_c_src}) endif() - FILE(GLOB_RECURSE source_serializer_cpp_src "serializer/source/*.cpp") - list(APPEND TENGINE_LIB_SRCS ${source_serializer_cpp_src}) +if(CONFIG_MXNET_SERIALIZER) + FILE(GLOB_RECURSE serializer_src "serializer/mxnet/*.cpp") + list(APPEND TENGINE_LIB_SRCS ${serializer_src}) +endif() + +if(CONFIG_TFLITE_SERIALIZER) + include_directories(serializer/include/tf_lite) + FILE(GLOB_RECURSE tflite_serializer_src "serializer/tf_lite/*.cpp") + list(APPEND TENGINE_LIB_SRCS ${tflite_serializer_src}) +endif() + + +FILE(GLOB_RECURSE source_serializer_cpp_src "serializer/source/*.cpp") +list(APPEND TENGINE_LIB_SRCS ${source_serializer_cpp_src}) FILE(GLOB plugin_init "serializer/plugin/init.cpp") diff --git a/core/include/compiler_fp16.h b/core/include/compiler_fp16.h new file mode 100644 index 000000000..8fe09b739 --- /dev/null +++ b/core/include/compiler_fp16.h @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#ifndef __COMPILIER_FP16_H__ +#define __COMPILIER_FP16_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef __ARM_ARCH + +#define fp16_to_fp32(data) \ + ({ \ + float f=data; \ + f; \ + }) + +#define fp32_to_fp16(data) \ + ({ \ + __fp16 f=data; \ + f; \ + }) + + +#else + +struct fp16_pack{ + unsigned short frac:10; + unsigned char exp: 5; + unsigned char sign:1; +} __attribute__((packed)); + +struct fp32_pack{ + unsigned int frac:23; + unsigned char exp: 8; + unsigned char sign:1; +} __attribute__((packed)); + +typedef struct fp16_pack __fp16; + +static inline float fp16_to_fp32(__fp16 data) +{ + float f; + struct fp32_pack * fp32=(struct fp32_pack *)&f; + struct fp16_pack * fp16=&data; + + int exp=fp16->exp; + + if(exp==31 && fp16->frac!=0) + { + //return __builtin_inf()-__builtin_inf(); + fp32->sign=fp16->sign; + fp32->exp=255; + fp32->frac=1; + + return f; + } + + if(exp==31) + exp=255; + if(exp==0) + exp=0; + else + exp=(exp-15)+127; + + fp32->exp=exp; + fp32->sign=fp16->sign; + fp32->frac=((int)fp16->frac)<<13; + + return f; + +} + + +static inline __fp16 fp32_to_fp16(float data) +{ + struct fp32_pack * fp32=(struct fp32_pack *)&data; + struct fp16_pack fp16; + + int exp=fp32->exp; + + if(fp32->exp==255 && fp32->frac!=0) + { + //NaN + fp16.exp=31; + fp16.frac=1; + fp16.sign=fp32->sign; + + return fp16; + } + + if((exp-127)<-14) + exp = 0; + else if((exp-127)>15) + exp=31; + else + exp=exp-127+15; + + fp16.exp=exp; + fp16.frac=fp32->frac>>13; + fp16.sign=fp32->sign; + + return fp16; +} + +#endif + + +#ifdef __cplusplus +} +#endif +#endif diff --git a/core/include/cpu_device.h b/core/include/cpu_device.h index 4e279ed12..3509cc27f 100644 --- a/core/include/cpu_device.h +++ b/core/include/cpu_device.h @@ -43,7 +43,7 @@ extern "C" { #define ARCH_ARM_V7 2 #define ARCH_ARM_V8_2 3 -#define MAX_CLUSTER_CPU_NUMBER 4 +#define MAX_CLUSTER_CPU_NUMBER 8 struct cpu_cluster { diff 
--git a/core/include/data_layout.hpp b/core/include/data_layout.hpp deleted file mode 100644 index e6e753919..000000000 --- a/core/include/data_layout.hpp +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haitao@openailab.com - */ -#ifndef __DATA_LAYOUT_HPP__ -#define __DATA_LAYOUT_HPP__ - -#include "named_data.hpp" - -namespace TEngine { - -struct DataLayout : public NamedData -{ - DataLayout(const std::string& str, bool as_default = false) - { - layout_name = str; - - SetData(layout_name, this); - - if(as_default) - SetDefaultData(this); - } - - DataLayout(std::string&& str, bool as_default = false) - { - layout_name = std::move(str); - SetData(layout_name, this); - - if(as_default) - SetDefaultData(this); - } - - static const DataLayout* GetLayout(const std::string& name) - { - return GetData(name); - } - - const std::string& GetName(void) const - { - return layout_name; - } - - virtual unsigned int GetDimNum() const - { - return 0; - } - virtual int GetH() const - { - return -1; - } - virtual int GetW() const - { - return -1; - } - virtual int GetC() const - { - return -1; - } - virtual int GetD() const - { - return -1; - } - 
virtual int GetN() const - { - return -1; - } - - virtual ~DataLayout(){}; - - std::string layout_name; -}; - -struct LayoutNCHW : public DataLayout -{ - LayoutNCHW(bool as_default = false) : DataLayout("NCHW", as_default){}; - - int GetN() const - { - return 0; - } - int GetC() const - { - return 1; - } - int GetH() const - { - return 2; - } - int GetW() const - { - return 3; - } - unsigned int GetDimNum() const - { - return 4; - } -}; - -struct LayoutNCDHW : public DataLayout -{ - LayoutNCDHW(bool as_default = false) : DataLayout("NCDHW", as_default){}; - - int GetN() const - { - return 0; - } - int GetC() const - { - return 1; - } - int GetD() const - { - return 2; - } - int GetH() const - { - return 3; - } - int GetW() const - { - return 4; - } - unsigned int GetDimNum() const - { - return 5; - } -}; - -struct LayoutNHWC : public DataLayout -{ - LayoutNHWC(bool as_default = false) : DataLayout("NHWC", as_default){}; - - int GetN() const - { - return 0; - } - int GetH() const - { - return 1; - } - int GetW() const - { - return 2; - } - int GetC() const - { - return 3; - } - unsigned int GetDimNum() const - { - return 4; - } -}; - -struct LayoutNDHWC : public DataLayout -{ - LayoutNDHWC(bool as_default = false) : DataLayout("NDHWC", as_default){}; - - int GetN() const - { - return 0; - } - int GetD() const - { - return 1; - } - int GetH() const - { - return 2; - } - int GetW() const - { - return 3; - } - int GetC() const - { - return 4; - } - unsigned int GetDimNum() const - { - return 5; - } -}; - -struct LayoutNHW : public DataLayout -{ - LayoutNHW(bool as_default = false) : DataLayout("NHW", as_default){}; - - int GetN() const - { - return 0; - } - int GetH() const - { - return 1; - } - int GetW() const - { - return 2; - } - unsigned int GetDimNum() const - { - return 3; - } -}; - -struct LayoutNW : public DataLayout -{ - LayoutNW(bool as_default = false) : DataLayout("NW", as_default){}; - - int GetN() const - { - return 0; - } - int GetW() const - { - return 
1; - } - unsigned int GetDimNum() const - { - return 2; - } -}; - -struct LayoutHW : public DataLayout -{ - LayoutHW(bool as_default = false) : DataLayout("HW", as_default){}; - - int GetH() const - { - return 0; - } - int GetW() const - { - return 1; - } - unsigned int GetDimNum() const - { - return 2; - } -}; - -struct LayoutW : public DataLayout -{ - LayoutW(bool as_default = false) : DataLayout("W", as_default){}; - - int GetW() const - { - return 0; - } - unsigned int GetDimNum() const - { - return 1; - } -}; - -} // namespace TEngine - -#endif diff --git a/core/include/exec_attr.hpp b/core/include/exec_attr.hpp index 455b1f4ce..c1e2adfa6 100644 --- a/core/include/exec_attr.hpp +++ b/core/include/exec_attr.hpp @@ -46,6 +46,7 @@ enum exec_policy_t #define EXEC_KERNEL_FP32 0 #define EXEC_KERNEL_FP16 1 #define EXEC_KERNEL_INT8 2 +#define EXEC_KERNEL_UINT8 3 #define MODEL_FORMAT_UNKNOWN 0 #define MODEL_FORMAT_TENGINE 1 @@ -54,6 +55,10 @@ enum exec_policy_t #define MODEL_FORMAT_MXNET 4 #define MODEL_FORMAT_TENSORFLOW 5 #define MODEL_FORMAT_TFLITE 6 +#define MODEL_FORMAT_DLA 7 + +#define MODEL_SUBFORMAT_AIPU 1 +#define MODEL_SUBFORMAT_NNIE 2 struct ExecAttr { @@ -61,12 +66,13 @@ struct ExecAttr int priority; int kernel_mode; int model_format; + int model_layout; + int graph_layout; bool low_mem_mode; bool fc_mt; // fc should in multi-threaded? bool pooling_mt; // pooling should in multi-threaded? 
void* exec_context; void* dev_handle; - int layout; ExecAttr(void) { @@ -79,7 +85,8 @@ struct ExecAttr model_format = MODEL_FORMAT_TENGINE; exec_context = nullptr; dev_handle = nullptr; - layout = -1; + graph_layout = -1; + model_layout = -1; } }; diff --git a/core/include/graph.hpp b/core/include/graph.hpp index 979e59400..fcae590e9 100644 --- a/core/include/graph.hpp +++ b/core/include/graph.hpp @@ -50,6 +50,8 @@ class Graph : public BaseObject { name_ = name; model_format_ = -1; + model_subformat_ = -1; + model_layout_ = -1; layout_ = -1; } @@ -147,10 +149,32 @@ class Graph : public BaseObject { model_format_ = model_format; } + int GetModelFormat(void) { return model_format_; } + + void SetModelSubFormat(int model_subformat) + { + model_subformat_ = model_subformat; + } + + int GetModelSubFormat(void) + { + return model_subformat_; + } + + void SetModelLayout(int model_layout) + { + model_layout_ = model_layout; + } + + int GetModelLayout(void) + { + return model_layout_; + } + void SetLayout(int layout) { layout_ = layout; @@ -176,6 +200,8 @@ class Graph : public BaseObject std::unordered_map owned_tensors_; int model_format_; + int model_subformat_; + int model_layout_; int layout_; Attribute attrs_; diff --git a/core/include/graph_executor.hpp b/core/include/graph_executor.hpp index 4320b3fb9..21c4319f3 100644 --- a/core/include/graph_executor.hpp +++ b/core/include/graph_executor.hpp @@ -46,6 +46,7 @@ class GraphExecutor graph_attached_ = false; exec_handle_ = nullptr; prerun_done_ = false; + optimize_only=0; InitAttrIO(); } @@ -137,6 +138,9 @@ class GraphExecutor return -1; } + bool GetOptimizeOnly(const char* name, void* val, int size); + bool SetOptimizeOnly(const char* name, const void* val, int size); + bool GetExecAttrEntry(const char* name, void* val, int size); bool SetExecAttrEntry(const char* name, const void* val, int size); @@ -170,6 +174,7 @@ class GraphExecutor AttrIO attr_io_; bool prerun_done_; + int optimize_only; }; } // namespace TEngine 
diff --git a/core/include/node.hpp b/core/include/node.hpp index f9e81f2f0..e2d6da1a7 100644 --- a/core/include/node.hpp +++ b/core/include/node.hpp @@ -343,6 +343,17 @@ class Node : public BaseObject bool dynamic_shape_; }; +#define ATTR_CUSTOM_ATTR "CUSTOM_ATTR" + +struct CustomNodeAttr +{ + int attr_size; + const char* type_name; + std::vector mem; +}; + +using node_custom_attr_map_t = std::unordered_map; + } // namespace TEngine #endif diff --git a/core/include/operator.hpp b/core/include/operator.hpp index b55af6a48..f243e1179 100644 --- a/core/include/operator.hpp +++ b/core/include/operator.hpp @@ -59,11 +59,11 @@ class Operator : public BaseObject return true; } - virtual bool GetParamItem(const char* param_name, const std::type_info* type_info, void* val) + virtual bool GetParamItem(const char* param_name, const char * type_name, void* val) { return false; } - virtual bool SetParamItem(const char* param_name, const std::type_info* type_info, const void* val) + virtual bool SetParamItem(const char* param_name, const char * type_name, const void* val) { return false; } @@ -84,6 +84,15 @@ class Operator : public BaseObject return 0.0f; } + void SetOpVer(int op_ver) + { + op_ver_ = op_ver; + } + int GetOpVer(void) + { + return op_ver_; + } + void SetName(const std::string& new_name) { name_ = new_name; @@ -125,13 +134,6 @@ class Operator : public BaseObject return ParseInputOutput(std::move(output_str), outputs_); } - Operator& SetLayout(const std::string& layout_str) - { - layout_ = layout_str; - - return *this; - } - Operator& SetDoc(std::string&& doc_str) { doc_ = doc_str; @@ -174,22 +176,21 @@ class Operator : public BaseObject { return outputs_[idx].second; } - const std::string& GetLayout(void) const + + Operator() { - return layout_; + op_ver_ = 1; } - - Operator() = default; Operator(const Operator&) = default; virtual ~Operator(){}; protected: + int op_ver_; std::string name_; bool dynamic_shape_; std::vector inputs_; std::vector outputs_; - 
std::string layout_; std::string doc_; private: @@ -290,12 +291,12 @@ template class OperatorWithParam : public Operator { const std::string& str = any_cast(data); - param.SetItemVal(ir->first, &typeid(const char*), str.c_str()); + param.SetItemVal(ir->first, typeid(const char*).name(), str.c_str()); } else if(data_type == typeid(int)) { float f = ( float )any_cast(data); - param.SetItemVal(ir->first, &typeid(float), &f); + param.SetItemVal(ir->first, typeid(float).name(), &f); } ir++; @@ -323,14 +324,14 @@ template class OperatorWithParam : public Operator return param; } - bool GetParamItem(const char* param_name, const std::type_info* type_info, void* val) override + bool GetParamItem(const char* param_name, const char * type_name, void* val) override { - return param_.GetItemVal(param_name, type_info, val); + return param_.GetItemVal(param_name, type_name, val); } - bool SetParamItem(const char* param_name, const std::type_info* type_info, const void* val) override + bool SetParamItem(const char* param_name, const char * type_name, const void* val) override { - return param_.SetItemVal(param_name, type_info, val); + return param_.SetItemVal(param_name, type_name, val); } protected: diff --git a/core/include/parameter.hpp b/core/include/parameter.hpp index 7b15bed90..3aeb79626 100644 --- a/core/include/parameter.hpp +++ b/core/include/parameter.hpp @@ -40,19 +40,19 @@ struct NamedParam { item_cpy_t cpy_func; item_set_any cpy_any; - const std::type_info* type_info; + const char* type_name; int data; }; - ItemInfo* FindItem(const std::string& name, const std::type_info* item_type) + ItemInfo* FindItem(const std::string& name, const char * type_name) { if(item_map_.count(name) == 0) return nullptr; ItemInfo& entry = item_map_.at(name); - // skip type checking if type_info is nullptr - if(item_type && (*item_type != *entry.type_info)) + // skip type checking if type_name is nullptr + if(type_name && entry.type_name && strcmp(type_name,entry.type_name)) { // 
printf("requested: %s recorded:%s\n",item_type->name(),entry.type_info->name()); return nullptr; @@ -61,9 +61,9 @@ struct NamedParam return &entry; } - bool GetItemVal(const std::string& name, const std::type_info* val_type, void* val) + bool GetItemVal(const std::string& name, const char * type_name, void* val) { - ItemInfo* entry = FindItem(name, val_type); + ItemInfo* entry = FindItem(name, type_name); if(entry == nullptr) return false; @@ -73,9 +73,9 @@ struct NamedParam return true; } - bool SetItemVal(const std::string& name, const std::type_info* val_type, const void* val) + bool SetItemVal(const std::string& name, const char* type_name, const void* val) { - ItemInfo* entry = FindItem(name, val_type); + ItemInfo* entry = FindItem(name, type_name); if(entry == nullptr) return false; @@ -91,11 +91,12 @@ struct NamedParam return false; ItemInfo& entry = item_map_.at(name); - const std::type_info* item_type = entry.type_info; - const std::type_info& any_type = n.type(); + const char * item_type = entry.type_name; + const char * any_type = n.type().name(); /* several special cases */ - if(*item_type == typeid(const char*) && any_type == typeid(std::string)) + if(!strcmp(item_type,typeid(const char*).name()) && + !strcmp(any_type, typeid(std::string).name())) { const char** ptr = ( const char** )(( char* )this + entry.data); const std::string& str = any_cast(n); @@ -105,7 +106,8 @@ struct NamedParam return true; } - if(*item_type == typeid(std::string) && any_type == typeid(const char*)) + if(!strcmp(item_type,typeid(std::string).name()) && + !strcmp(any_type,typeid(const char*).name())) { std::string* p_str = ( std::string* )(( char* )this + entry.data); const char* ptr = any_cast(n); @@ -120,7 +122,7 @@ struct NamedParam bool SetItemFromAny(const std::string& name, const any& n) { - ItemInfo* entry = FindItem(name, &n.type()); + ItemInfo* entry = FindItem(name, n.type().name()); if(entry == nullptr) return SetItemCompatibleAny(name, n); @@ -145,7 +147,7 @@ 
struct NamedParam { \ typedef decltype(e) T; \ ItemInfo info; \ - info.type_info = &typeid(T); \ + info.type_name = typeid(T).name(); \ info.data = ( char* )&e - ( char* )this; \ info.cpy_func = [](void* data, const void* v) { *( T* )data = *( const T* )v; }; \ info.cpy_any = [](void* data, const any& n) { *( T* )data = any_cast(n); }; \ diff --git a/core/include/serializer.hpp b/core/include/serializer.hpp index 24a5b36ec..9bf80fe5b 100644 --- a/core/include/serializer.hpp +++ b/core/include/serializer.hpp @@ -41,6 +41,7 @@ class Serializer { public: using op_load_map_t = std::unordered_map; + using op_save_map_t = std::unordered_map; Serializer() {} virtual ~Serializer(){}; @@ -103,11 +104,34 @@ class Serializer return op_load_map_[op_name]; } + bool RegisterOpSaveMethod(const std::string& op_name, const any& save_func) + { + if(op_save_map_.count(op_name)) + return false; + + op_save_map_[op_name] = save_func; + return true; + } + + bool FindOpSaveMethod(const std::string& op_name) + { + if(op_save_map_.count(op_name)) + return true; + + return false; + } + + any& GetOpSaveMethod(const std::string& op_name) + { + return op_save_map_[op_name]; + } + protected: std::string version_; std::string name_; std::string format_name_; op_load_map_t op_load_map_; + op_save_map_t op_save_map_; }; using SerializerPtr = std::shared_ptr; diff --git a/core/include/static_graph.hpp b/core/include/static_graph.hpp index a3aaacf88..f39aea2e7 100644 --- a/core/include/static_graph.hpp +++ b/core/include/static_graph.hpp @@ -60,14 +60,20 @@ struct StaticGraph std::vector tensor_list; std::unordered_map const_tensor_map; std::vector mem_src; - int layout; + int graph_layout; + int model_layout; + int model_format; + int model_subformat; // for dla models StaticGraph(void) { exec_context = nullptr; dev_handle = nullptr; release_func = nullptr; - layout = -1; + graph_layout = -1; + model_layout = -1; + model_format = -1; + model_subformat = -1; } ~StaticGraph(void); @@ -82,6 +88,7 @@ 
struct StaticNode std::string name; int index; StaticOpPtr op; + Attribute attrs; std::vector input_tensor_list; std::vector output_tensor_list; @@ -100,10 +107,10 @@ struct StaticTensor int mem_size; std::vector dims; int data_type; - std::string data_layout; int type; float scale; int zero_point; + int width; NodeSynapse producer; std::vector consumer; virtual ~StaticTensor() {} diff --git a/core/include/static_graph_interface.hpp b/core/include/static_graph_interface.hpp index d8a65cde7..b002f0660 100644 --- a/core/include/static_graph_interface.hpp +++ b/core/include/static_graph_interface.hpp @@ -42,6 +42,9 @@ void DumpStaticGraph(StaticGraph* graph); const void* GetGraphContext(StaticGraph* graph); void SetGraphDevHandle(StaticGraph* graph, void* release_func, void* dev_handle); void SetGraphLayout(StaticGraph* graph, int layout); +void SetModelLayout(StaticGraph* graph, int layout); +void SetModelFormat(StaticGraph* graph, int model_format); +void SetModelSubFormat(StaticGraph* graph, int model_subformat); // TODO: not available to user void SetGraphInternalName(StaticGraph* graph, const std::string& name); @@ -89,7 +92,6 @@ StaticTensor* CreateStaticTensor(StaticGraph* grap, const std::string& name); void SetTensorDim(StaticTensor*, const std::vector& dims); const std::vector& GetTensorDim(StaticTensor*); void SetTensorDataType(StaticTensor*, int data_type); -void SetTensorDataLayout(StaticTensor*, const std::string& data_layout); void SetTensorType(StaticTensor*, int type); int SetTensorSize(StaticTensor*, int size); diff --git a/core/include/tengine_c_api.h b/core/include/tengine_c_api.h index 14902427b..994fd1230 100644 --- a/core/include/tengine_c_api.h +++ b/core/include/tengine_c_api.h @@ -71,6 +71,11 @@ extern "C" { #define GRAPH_PERF_STAT_RESET 4 #define GRAPH_PERF_STAT_GET 5 +/* quant mode */ +#define TENGINE_QUANT_FP16 0 +#define TENGINE_QUANT_INT8 1 +#define TENGINE_QUANT_UINT8 2 + /* follow the std. 
UNIX log level definitioin */ enum log_level { @@ -349,6 +354,19 @@ graph_t create_graph(context_t context, const char* model_format, const char* fi int save_graph(graph_t graph, const char* model_format, const char* file_name, ...); +/*! + * @brief quant the graph according to the quant mode + * + * @param [in/out] graph, the graph handle + * @param [in] quant_mode, the quant mode(fp16, int8 or uint8). see TENGINE_QUANT_FP16 etc. + * @param [in] node_no_quant_idxs, the index array of nodes not quant + * @param [in] node_no_quant_number, the number of nodes not quant + * + * @return 0 success or -1 fail + */ + +int quant_graph(graph_t graph, int quant_mode, int node_no_quant_idxs[], int node_no_quant_number); + /*! * @brief Set the layout type of the graph * the default layout of graph is NCHW @@ -577,7 +595,6 @@ int get_node_output_number(node_t node); /*! * @brief Get the input tensor number of a node. * - * @param [in] graph: The graph handle. * @param [in] node: The node hanle. * * @return >=1 the number of output tensor, @@ -586,19 +603,44 @@ int get_node_output_number(node_t node); */ int get_node_input_number(node_t node); + +/*! + * @brief Get graph node number + * + * + * @param [in] graph: the graph handle + * + * @return >=0 the number of the graph node + * -1 on error + */ + +int get_graph_node_number(graph_t graph); + +/*! + * @brief Get graph node by idx + * + * + * @param [in] graph: the graph handle + * @param [in] node_idx: the node index + * + * @return the node handle or NULL on error + */ + +node_t get_graph_node_by_idx(graph_t graph, int node_idx); + /*! * @brief Add an attribute to a node. * * @param [in] node: The target node handle. * @param [in] attr_name: The name of the attribute to be added. - * @param [in] type_info: The pointer to the std::type_info of expected type - * can be set to NULL to skip type match checking. 
+ * @param [in] type_name: The c string get by std::type_info::name() + * can be set to NULL to skip type match checking. * @param [in] size: The size of the attribute * * @return 0: Successfully, * -1: Failed. */ -int add_node_attr(node_t node, const char* attr_name, const void* type_info, int size); +int add_node_attr(node_t node, const char* attr_name, const char * type_name, int size); /*! * @brief Get the attribute value (int) of a node @@ -645,7 +687,7 @@ int get_node_attr_pointer(node_t node, const char* attr_name, void* attr_val); * * @param [in] node: The target node. * @param [in] attr_name: The name of the attribute to be retrieval. - * @param [in] type_info: The pointer to the std::type_info of expected type + * @param [in] type_name: The c string get by std::type_info::name() * can be set to NULL to skip type match checking. * @param [out] buf: The pointer to the buffer to save val. * @param [in] size: The buffer size. @@ -654,7 +696,7 @@ int get_node_attr_pointer(node_t node, const char* attr_name, void* attr_val); * -1: Failed; The name does not exist or the type mismatch. * */ -int get_node_attr_generic(node_t node, const char* attr_name, const void* type_info, void* buf, int size); +int get_node_attr_generic(node_t node, const char* attr_name, const char* type_name, void* buf, int size); /*! * @brief Set the attribute value (int) of a node @@ -703,7 +745,7 @@ int set_node_attr_pointer(node_t node, const char* attr_name, const void* attr_v * * @param [in] node: The target node. * @param [in] attr_name: The name of the attribute to be retrieval. - * @param [in] type_info: The pointer to the std::type_info of wanted type, + * @param [in] type_name: The name of std::type_info::name() * can be set to NULL to skip type match checking. * @param [in] buf: The pointer to the buffer to hold val. * @param [in] size: The buffer size. 
@@ -712,7 +754,7 @@ int set_node_attr_pointer(node_t node, const char* attr_name, const void* attr_v * -1: Failed, The name does not exist or the type mismatch. * */ -int set_node_attr_generic(node_t node, const char* attr_name, const void* type_info, const void* buf, int size); +int set_node_attr_generic(node_t node, const char* attr_name, const char * type_name, const void* buf, int size); /*! * @brief Set customer kernel of a node, on a specific device, diff --git a/core/include/tengine_c_compat.h b/core/include/tengine_c_compat.h index e75a10bb5..4488f9ca6 100644 --- a/core/include/tengine_c_compat.h +++ b/core/include/tengine_c_compat.h @@ -195,7 +195,7 @@ int get_node_param_pointer(node_t node, const char* param_name, void* param_val) * * @param node, the target node * @param param_name, the name of the param to be retrieval - * @param type_info, pointer to the std::type_info of wanted type, NULL to skip type check + * @param type_name, c string return bye the std::type_info::name() , NULL to skip type check * @param param_val, pointer to the val to be saved * @param size, parameter size * @@ -203,7 +203,7 @@ int get_node_param_pointer(node_t node, const char* param_name, void* param_val) * <0, failed; probably the name does not exist or the type mismatch */ -int get_node_param_generic(node_t node, const char* param_name, const void* type_info, void* param_val, int size); +int get_node_param_generic(node_t node, const char* param_name, const char* type_name, void* param_val, int size); /*! * @brief infer shape for graph @@ -216,11 +216,10 @@ int infer_shape(graph_t graph); * @brief Get the layout of tensor. * * @param [in] tensor: The tensor handle. - * @param [out] layout: The layout of tensor. * @return >=1 the valid dim number, or -1 Fail. * */ -int get_tensor_layout(tensor_t tensor, char* layout); +int get_tensor_layout(tensor_t tensor); /*! * @brief Set the layout of tensor. 
@@ -230,7 +229,7 @@ int get_tensor_layout(tensor_t tensor, char* layout); * @return 0: Success; -1: Fail. * */ -int set_tensor_layout(tensor_t tensor, const char* layout); +int set_tensor_layout(tensor_t tensor, const int layout); #ifdef __cplusplus } diff --git a/core/include/tengine_c_helper.hpp b/core/include/tengine_c_helper.hpp index 1b079129f..4f8d45048 100644 --- a/core/include/tengine_c_helper.hpp +++ b/core/include/tengine_c_helper.hpp @@ -29,10 +29,10 @@ extern "C" { -int node_add_attr(node_t node, const char* attr_name, const void* type_info, int size); +int node_add_attr(node_t node, const char* attr_name, const char* type_name, int size); -int node_get_attr_generic(void* node, const char* param_name, const void* type_info, void* param_val, int param_size); -int node_set_attr_generic(void* node, const char* param_name, const void* type_info, const void* param_val, +int node_get_attr_generic(void* node, const char* param_name, const char* type_name, void* param_val, int param_size); +int node_set_attr_generic(void* node, const char* param_name, const char* type_name, const void* param_val, int param_size); void set_cpu_list(const char* cpu_list_str); @@ -44,6 +44,8 @@ graph_t create_graph_in_context(context_t exec_context, const char* graph_name, int save_graph_internal(graph_t graph, const char* file_format, const char* fname, va_list argp); +int quant_graph_internal(graph_t graph, int quant_mode, int node_no_quant_idxs[], int node_no_quant_number); + const char* get_model_name(graph_t graph); } @@ -51,7 +53,7 @@ namespace TEngine { class GraphExecutor; -void InitAllPlugin(void); +int InitAllPlugin(void); GraphExecutor* do_merge_graph(std::vector& exec_list); diff --git a/core/include/tengine_test_api.h b/core/include/tengine_test_api.h deleted file mode 100644 index 4951b8a16..000000000 --- a/core/include/tengine_test_api.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license 
agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ -#ifndef __TENGINE_TEST_API_H__ -#define __TENGINE_TEST_API_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void* test_node_t; - -test_node_t create_convolution_test_node(int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h0, int pad_h1, - int pad_w0, int pad_w1, int dilation_h, int dilation_w, int input_channel, - int output_channel, int group); - -test_node_t create_fc_test_node(int hidden_number, int output_number); - -test_node_t create_pooling_test_node(int pool_method, int kernel_h, int kernel_w, int stride_h, int stride_w, - int pad_h0, int pad_h1, int pad_w0, int pad_w1, int global); - -int test_node_set_input(test_node_t node, float* input_data[], int* input_shape[], int input_number); -int test_node_set_output(test_node_t node, float* output_data[], int* output_shape[], int output_number); - -int test_node_prerun(test_node_t node); - -int test_node_run(test_node_t node); - -int test_node_postrun(test_node_t node); - -void destroy_test_node(test_node_t node); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/core/include/tensor_shape.hpp b/core/include/tensor_shape.hpp index abdd042dd..b2a53abd5 100644 --- a/core/include/tensor_shape.hpp +++ 
b/core/include/tensor_shape.hpp @@ -44,12 +44,12 @@ enum TensorType class TShape { public: - void SetDataLayout(const std::string& layout_name) + void SetDataLayout(int layout) { - layout_ = layout_name; + layout_ = layout; } - const std::string& GetDataLayout(void) const + int GetDataLayout(void) const { return layout_; } @@ -95,7 +95,7 @@ class TShape return true; } - void SetDim(const std::vector& args, bool layout_check = false); + void SetDim(const std::vector& args); void DumpShape(std::ostream& os) const; @@ -103,7 +103,6 @@ class TShape int GetC(void) const; int GetH(void) const; int GetW(void) const; - int GetD(void) const; TShape() = default; @@ -148,7 +147,7 @@ class TShape private: std::vector dim_; - std::string layout_; + int layout_; }; } // namespace TEngine diff --git a/core/include/worker_thread.hpp b/core/include/worker_thread.hpp index edac8c1ea..a3611e81d 100644 --- a/core/include/worker_thread.hpp +++ b/core/include/worker_thread.hpp @@ -26,6 +26,9 @@ #ifndef __WORKER_THREAD_HPP__ #define __WORKER_THREAD_HPP__ +#include +#include + #include #include #include @@ -107,6 +110,19 @@ template class WorkerThread private: void DoWork(void) { + int task_done_count = 0; + bool skip = false; + +#ifdef CONFIG_MAX_RUN_TIME + long start_time; + + struct timeval tv; + + gettimeofday(&tv, NULL); + + start_time = tv.tv_sec; +#endif + // bind CPU first if(bind_cpu_ >= 0) { @@ -126,10 +142,30 @@ template class WorkerThread if(quit_work_) break; - process_(task, bind_cpu_); +#ifdef CONFIG_MAX_RUN_COUNT + if(task_done_count > CONFIG_MAX_RUN_COUNT) + skip = true; +#endif + +#ifdef CONFIG_MAX_RUN_TIME + if(!(task_done_count & 0x3fff)) + { + struct timeval tv; + + gettimeofday(&tv, NULL); + + if((tv.tv_sec - start_time) >= CONFIG_MAX_RUN_TIME) + skip = true; + } + +#endif + if(!skip) + process_(task, bind_cpu_); if(inc_done_) inc_done_(1); + + task_done_count++; } } diff --git a/core/lib/Makefile b/core/lib/Makefile index c5b44fa65..d46063ee7 100644 --- 
a/core/lib/Makefile +++ b/core/lib/Makefile @@ -1,5 +1,4 @@ obj-y+=data_type.o -obj-y+=data_layout.o obj-y+=exec_context.o obj-y+=tensor.o obj-y+=tensor_shape.o @@ -19,6 +18,7 @@ obj-y+=compiler.o obj-y+=tengine_c_helper.o obj-y+=tengine_version.o obj-y+=tengine_errno.o +obj-y+=tengine_runtime_error.o obj-y+=logger/ obj-$(CONFIG_LEGACY_API)+=tengine_c_compat.o diff --git a/core/lib/graph.cpp b/core/lib/graph.cpp index 1e1285c49..3e8afc0bb 100644 --- a/core/lib/graph.cpp +++ b/core/lib/graph.cpp @@ -142,6 +142,14 @@ bool Graph::CreateNodeFromStatic(Node* node, const StaticGraph* static_graph, co op->SetDynamicShape(static_op->dynamic_shape); node->SetDynamicShape(static_op->dynamic_shape); + /* copy attrs in static_node */ + std::vector node_attr_name = static_node->attrs.ListAttr(); + + for(unsigned int i = 0; i < node_attr_name.size(); i++) + { + node->SetAttr(node_attr_name[i], static_node->attrs.GetAttr(node_attr_name[i])); + } + /* copy attrs in static_op */ std::vector attr_name = static_op->attrs.ListAttr(); @@ -166,7 +174,7 @@ bool Graph::CreateNodeFromStatic(Node* node, const StaticGraph* static_graph, co TShape& shape = tensor->GetShape(); - shape.SetDataLayout(static_tensor->data_layout); + shape.SetDataLayout(static_graph->graph_layout); shape.SetDim(static_tensor->dims); std::vector* quant_param = tensor->GetQuantParam(); @@ -174,6 +182,7 @@ bool Graph::CreateNodeFromStatic(Node* node, const StaticGraph* static_graph, co (*quant_param)[0].scale = static_tensor->scale; (*quant_param)[0].zero_point = static_tensor->zero_point; + (*quant_param)[0].width = static_tensor->width; if(static_tensor->type == kConstTensor) { @@ -227,6 +236,7 @@ bool Graph::SetupConnection(Tensor* tensor, const StaticGraph* static_graph, con return true; } +#if 0 static int model_format_mapping(const std::string& fmt) { if(fmt == "tengine") @@ -258,6 +268,7 @@ static int model_format_mapping(const std::string& fmt) return MODEL_FORMAT_UNKNOWN; } } +#endif bool 
Graph::RealCreateFromStatic(const StaticGraphPtr& static_graph) { @@ -324,7 +335,11 @@ bool Graph::RealCreateFromStatic(const StaticGraphPtr& static_graph) /* save the model format */ - model_format_ = model_format_mapping(static_graph->source_format); + //model_format_ = model_format_mapping(static_graph->source_format); + model_format_=static_graph->model_format; + model_subformat_=static_graph->model_subformat; + model_layout_=static_graph->model_layout; + layout_=static_graph->graph_layout; return true; } diff --git a/core/lib/graph_executor.cpp b/core/lib/graph_executor.cpp index d806695e0..9d1a89687 100644 --- a/core/lib/graph_executor.cpp +++ b/core/lib/graph_executor.cpp @@ -44,6 +44,8 @@ bool GraphExecutor::CreateGraph(void* exec_context, const char* graph_name, cons { graph = new Graph(graph_name); graph->SetModelFormat(MODEL_FORMAT_TENGINE); + graph->SetLayout(TENGINE_LAYOUT_NCHW); + graph->SetModelLayout(TENGINE_LAYOUT_NCHW); } else { @@ -105,6 +107,7 @@ bool GraphExecutor::PrepareExec(void* exec_context, Graph* graph, StaticGraph* s bool GraphExecutor::SetExecParam(Graph* graph) { +#if 0 int model_format = graph->GetModelFormat(); /* set proper layout */ @@ -112,11 +115,17 @@ bool GraphExecutor::SetExecParam(Graph* graph) model_format == MODEL_FORMAT_TENSORFLOW || model_format == MODEL_FORMAT_MXNET || model_format == MODEL_FORMAT_TENGINE) { - exec_attr_.layout = TENGINE_LAYOUT_NCHW; + exec_attr_.graph_layout = TENGINE_LAYOUT_NCHW; + + if(model_format == MODEL_FORMAT_TENSORFLOW) + exec_attr_.model_layout = TENGINE_LAYOUT_NHWC; + else + exec_attr_.model_layout = TENGINE_LAYOUT_NCHW; } else if(model_format == MODEL_FORMAT_TFLITE) { - exec_attr_.layout = TENGINE_LAYOUT_NHWC; + exec_attr_.graph_layout = TENGINE_LAYOUT_NHWC; + exec_attr_.model_layout = TENGINE_LAYOUT_NHWC; } else { @@ -125,9 +134,28 @@ bool GraphExecutor::SetExecParam(Graph* graph) } exec_attr_.model_format = model_format; +#else - if(graph->GetLayout() >= 0) - exec_attr_.layout = 
graph->GetLayout(); +#endif + + exec_attr_.graph_layout = graph->GetLayout(); + exec_attr_.model_layout = graph->GetModelLayout(); + exec_attr_.model_format = graph->GetModelFormat(); + + if(exec_attr_.graph_layout<0) + { + LOG_ERROR()<<"why graph layout is: "<output_nodes.size(); + Graph* cur_graph = GetOptimizedGraph(); - return graph_->output_nodes.size(); + return cur_graph->output_nodes.size(); } const std::string& GraphExecutor::GetGraphOutputNodeName(int idx) { - Graph* optimized_graph = GetOptimizedGraph(); - - Graph* cur_graph; - - if(optimized_graph) - cur_graph = optimized_graph; - else - cur_graph = graph_; + Graph* cur_graph = GetOptimizedGraph(); std::vector& outputs = cur_graph->output_nodes; Node* node = outputs[idx]; @@ -286,35 +304,26 @@ bool GraphExecutor::SetGraphOutputNode(const std::vector& node_name Node* GraphExecutor::FindNode(const std::string& name) { - Graph* optimized_graph = GetOptimizedGraph(); - - if(optimized_graph) - { - Node* node = optimized_graph->FindNode(name); - if(node) - return node; - } + Graph* cur_graph = GetOptimizedGraph(); - return graph_->FindNode(name); + Node* node = cur_graph->FindNode(name); + if(node) + return node; + else + return graph_->FindNode(name); } Tensor* GraphExecutor::FindTensor(const std::string& name) { // try to search in optmized graph first - Graph* optimized_graph = GetOptimizedGraph(); - - if(optimized_graph) - { - Tensor* tensor; - - tensor = optimized_graph->FindTensor(name); - - if(tensor) - return tensor; - } + Graph* cur_graph = GetOptimizedGraph(); - return graph_->FindTensor(name); + Tensor* tensor = cur_graph->FindTensor(name); + if(tensor) + return tensor; + else + return graph_->FindTensor(name); } bool GraphExecutor::InferShape(void) @@ -381,7 +390,7 @@ bool GraphExecutor::InferShape(void) outputs.resize(node->GetOutputNum()); - if(!op->InferShape(inputs, outputs, exec_attr_.layout)) + if(!op->InferShape(inputs, outputs, exec_attr_.graph_layout)) { std::cout << "infer shaped for 
node: " << node->GetName() << " op: " << op->GetName() << " failed\n"; return false; @@ -521,6 +530,18 @@ bool GraphExecutor::Prerun(void) SetExecParam(graph_); + int optimize_only=0; + + GetGraphAttr("optimize_only",&optimize_only,sizeof(int)); + + if(optimize_only) + { + if(exec_engine_->Prerun(exec_handle_)) + return true; + else + return false; + } + if(InferShape() && exec_engine_->Prerun(exec_handle_)) { prerun_done_ = true; @@ -578,6 +599,27 @@ Graph* GraphExecutor::GetOptimizedGraph(void) return graph; } +bool GraphExecutor::GetOptimizeOnly(const char* name, void* val, int size) +{ + if(size!=sizeof(int)) + return false; + + *(int *)val=optimize_only; + + return 0; + +} + +bool GraphExecutor::SetOptimizeOnly(const char* name, const void* val, int size) +{ + const int * int_ptr=(const int *)val; + + optimize_only=int_ptr[0]; + + return true; +} + + bool GraphExecutor::SetExecAttrEntry(const char* name, const void* val, int size) { if(!strcmp("exec_policy", name)) @@ -693,6 +735,17 @@ void GraphExecutor::InitAttrIO(void) attr_io_.RegSetFunc("fc_mt", set_func); attr_io_.RegSetFunc("pooling_mt", set_func); + + auto set_opt_only_func=std::bind(&GraphExecutor::SetOptimizeOnly,this,std::placeholders::_1,std::placeholders::_2, + std::placeholders::_3); + + auto get_opt_only_func=std::bind(&GraphExecutor::GetOptimizeOnly,this,std::placeholders::_1,std::placeholders::_2, + std::placeholders::_3); + + attr_io_.RegSetFunc("optimize_only",set_opt_only_func); + attr_io_.RegGetFunc("optimize_only",get_opt_only_func); + + // bailout auto set_func2 = std::bind(&GraphExecutor::BailoutSetAttr, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); diff --git a/core/lib/logger/logger.cpp b/core/lib/logger/logger.cpp index af75742b2..c5592e8d3 100644 --- a/core/lib/logger/logger.cpp +++ b/core/lib/logger/logger.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include "compiler.hpp" #include "logger.hpp" @@ -217,8 +218,13 @@ log_stream_t 
StdLogger::Log(LogLevel level) if(option_.log_date) { auto t = system_clock::to_time_t(system_clock::now()); - +#if defined(__GNUC__) && __GNUC__ > 5 (*log_stream) << std::put_time(std::localtime(&t), "%Y-%m-%d %X "); +#else + char buf[128]; + strftime(buf,128,"%Y-%m-%d %X ",localtime(&t)); + (*log_stream)< mem; -}; - -using node_custom_attr_map_t = std::unordered_map; - -int NodeAddParamGeneric(void* node, const char* param_name, const void* type_info, int param_size) +int NodeAddParamGeneric(void* node, const char* param_name, const char * type_name, int param_size) { Node* real_node = ( Node* )node; @@ -159,7 +148,7 @@ int NodeAddParamGeneric(void* node, const char* param_name, const void* type_inf CustomNodeAttr attr_entry; - attr_entry.type_info = type_info; + attr_entry.type_name = type_name; attr_entry.attr_size = param_size; (*attr_map)[param_name] = attr_entry; @@ -167,13 +156,13 @@ int NodeAddParamGeneric(void* node, const char* param_name, const void* type_inf return 0; } -int NodeGetParamGeneric(void* node, const char* param_name, const void* type_info, void* param_val, int size) +int NodeGetParamGeneric(void* node, const char* param_name, const char * type_name, void* param_val, int size) { Node* real_node = ( Node* )node; Operator* op = real_node->GetOp(); - if(op->GetParamItem(param_name, ( const std::type_info* )type_info, param_val)) + if(op->GetParamItem(param_name, type_name, param_val)) return 0; /* check custom attr */ @@ -193,7 +182,7 @@ int NodeGetParamGeneric(void* node, const char* param_name, const void* type_inf CustomNodeAttr* attr_entry = &attr_map->at(param_name); - if((size != attr_entry->attr_size) || (type_info && attr_entry->type_info && type_info != attr_entry->type_info)) + if((size != attr_entry->attr_size) || (type_name && attr_entry->type_name && strcmp(type_name,attr_entry->type_name))) { set_tengine_errno(EINVAL); return -1; @@ -206,13 +195,13 @@ int NodeGetParamGeneric(void* node, const char* param_name, const void* 
type_inf return 0; } -int NodeSetParamGeneric(void* node, const char* param_name, const void* type_info, const void* param_val, int size) +int NodeSetParamGeneric(void* node, const char* param_name, const char* type_name, const void* param_val, int size) { Node* real_node = ( Node* )node; Operator* op = real_node->GetOp(); - if(op->SetParamItem(param_name, ( const std::type_info* )type_info, param_val)) + if(op->SetParamItem(param_name, type_name, param_val)) return 0; /* check custom attr */ @@ -232,7 +221,7 @@ int NodeSetParamGeneric(void* node, const char* param_name, const void* type_inf CustomNodeAttr* attr_entry = &attr_map->at(param_name); - if((size != attr_entry->attr_size) || (type_info && attr_entry->type_info && type_info != attr_entry->type_info)) + if((size != attr_entry->attr_size) || (type_name && attr_entry->type_name && strcmp(type_name,attr_entry->type_name))) { set_tengine_errno(EINVAL); return -1; diff --git a/core/lib/serializer.cpp b/core/lib/serializer.cpp index 5eaad045f..8dde693c5 100644 --- a/core/lib/serializer.cpp +++ b/core/lib/serializer.cpp @@ -60,4 +60,21 @@ any& GetOpLoadMethod(const std::string& op_name, const std::string& method_name) return op_method_load_map[key]; } +bool FindOpSaveMethod(const std::string& op_name, const std::string& method_name) +{ + std::string key = op_name + method_name; + + if(op_method_save_map.ExistAttr(key)) + return true; + + return false; +} + +any& GetOpSaveMethod(const std::string& op_name, const std::string& method_name) +{ + std::string key = op_name + method_name; + + return op_method_save_map[key]; +} + } // namespace TEngine diff --git a/core/lib/static_graph.cpp b/core/lib/static_graph.cpp index e9b6bab2a..3de56f78b 100644 --- a/core/lib/static_graph.cpp +++ b/core/lib/static_graph.cpp @@ -69,7 +69,22 @@ void SetGraphDevHandle(StaticGraph* graph, void* release_func, void* dev_handle) void SetGraphLayout(StaticGraph* graph, int layout) { - graph->layout = layout; + graph->graph_layout = 
layout; +} + +void SetModelLayout(StaticGraph* graph, int layout) +{ + graph->model_layout = layout; +} + +void SetModelFormat(StaticGraph* graph, int model_format) +{ + graph->model_format = model_format; +} + +void SetModelSubFormat(StaticGraph* graph, int model_subformat) +{ + graph->model_subformat = model_subformat; } void SetGraphInternalName(StaticGraph* graph, const std::string& name) @@ -382,7 +397,6 @@ StaticTensor* CreateStaticTensor(StaticGraph* graph, const std::string& name) tensor_ptr->index = tensor_idx; tensor_ptr->name = name; tensor_ptr->type = kVarTensor; - graph->tensor_list.push_back(tensor_ptr); return tensor_ptr.get(); @@ -403,11 +417,6 @@ void SetTensorDataType(StaticTensor* tensor, int data_type) tensor->data_type = data_type; } -void SetTensorDataLayout(StaticTensor* tensor, const std::string& data_layout) -{ - tensor->data_layout = data_layout; -} - void SetTensorType(StaticTensor* tensor, int type) { tensor->type = type; @@ -499,7 +508,6 @@ void DumpStaticNode(StaticGraph* graph, StaticNode* node, std::ostream& os) StaticTensorPtr tensor_ptr = graph->tensor_list[index]; os << "\tI" << i << ": " << tensor_ptr->name << " type: " << tensor_ptr->type; - os << " datalayout: " << tensor_ptr->data_layout << " "; os << " data_type: " << tensor_ptr->data_type << " "; if(tensor_ptr->dims.size()) @@ -521,7 +529,6 @@ void DumpStaticNode(StaticGraph* graph, StaticNode* node, std::ostream& os) StaticTensorPtr tensor_ptr = graph->tensor_list[index]; os << "\tO" << i << ": " << tensor_ptr->name << " type: " << tensor_ptr->type; - os << " datalayout: " << tensor_ptr->data_layout << " "; os << " data_type: " << tensor_ptr->data_type << " "; if(tensor_ptr->dims.size()) diff --git a/core/lib/tengine_c_api.cpp b/core/lib/tengine_c_api.cpp index 79d767ce6..f484f5c1b 100644 --- a/core/lib/tengine_c_api.cpp +++ b/core/lib/tengine_c_api.cpp @@ -88,7 +88,11 @@ int init_tengine(void) set_cpu_list(cpu_list_str); } - InitAllPlugin(); + if(InitAllPlugin()<0) + { + 
return -1; + } + if(TEnginePlugin::InitModule() < 0) { @@ -104,9 +108,37 @@ int init_tengine(void) return 0; } +void dump_mem_prof(void) +{ + int pid=getpid(); + + char fname[128]; + + LOG_INFO()<<"\ntengine memory profile result:\n"; + + sprintf(fname,"/proc/%d/status",pid); + + FILE * fp=fopen(fname,"r"); + + char line[128]; + + while(fgets(line,128,fp)) + { + if(line[0]=='V' && line[1]=='m') + LOG_INFO()<(graph); + Graph* g = executor->GetOptimizedGraph(); + + if(g->GetModelFormat() == MODEL_FORMAT_TFLITE) + { + LOG_INFO() << "Not quant tf-lite model.\n"; + return 0; + } + + if(quant_mode != TENGINE_QUANT_FP16 && quant_mode != TENGINE_QUANT_INT8) + { + LOG_ERROR() << "Currently only support fp16 and int8 quant.\n"; + set_tengine_errno(EINVAL); + return -1; + } + + return quant_graph_internal(graph, quant_mode, node_no_quant_idxs, node_no_quant_number); +} + int set_graph_layout(graph_t graph, int layout_type) { if(layout_type != TENGINE_LAYOUT_NCHW && layout_type != TENGINE_LAYOUT_NHWC) @@ -691,12 +745,40 @@ int get_node_input_number(node_t node) return real_node->GetInputNum(); } -int add_node_attr(node_t node, const char* attr_name, const void* type_info, int size) +int get_graph_node_number(graph_t graph) +{ + GraphExecutor* executor = reinterpret_cast(graph); + Graph* real_graph = executor->GetOptimizedGraph(); + + return real_graph->seq_nodes.size(); +} + +node_t get_graph_node_by_idx(graph_t graph, int node_idx) +{ + GraphExecutor* executor = reinterpret_cast(graph); + Graph* real_graph = executor->GetOptimizedGraph(); + + int node_num=real_graph->seq_nodes.size(); + + if(node_idx<0 || node_idx>=node_num) + { + set_tengine_errno(EINVAL); + return nullptr; + } + + Node* node = real_graph->seq_nodes[node_idx]; + + node->SetAttr(ATTR_API_GRAPH, executor); + + return node; +} + +int add_node_attr(node_t node, const char* attr_name, const char* type_name, int size) { /* first check if the attribute exists*/ void* buf = malloc(size); - int ret = 
get_node_attr_generic(node, attr_name, type_info, buf, size); + int ret = get_node_attr_generic(node, attr_name, type_name, buf, size); free(buf); @@ -706,17 +788,17 @@ int add_node_attr(node_t node, const char* attr_name, const void* type_info, int return -1; } - return node_add_attr(node, attr_name, type_info, size); + return node_add_attr(node, attr_name, type_name, size); } int get_node_attr_int(node_t node, const char* attr_name, int* attr_val) { - return get_node_attr_generic(node, attr_name, &typeid(int), attr_val, sizeof(int)); + return get_node_attr_generic(node, attr_name, typeid(int).name(), attr_val, sizeof(int)); } int get_node_attr_float(node_t node, const char* attr_name, float* attr_val) { - return get_node_attr_generic(node, attr_name, &typeid(float), attr_val, sizeof(float)); + return get_node_attr_generic(node, attr_name, typeid(float).name(), attr_val, sizeof(float)); } int get_node_attr_pointer(node_t node, const char* attr_name, void* attr_val) @@ -724,19 +806,19 @@ int get_node_attr_pointer(node_t node, const char* attr_name, void* attr_val) return get_node_attr_generic(node, attr_name, nullptr, attr_val, sizeof(void*)); } -int get_node_attr_generic(node_t node, const char* attr_name, const void* type_info, void* buf, int size) +int get_node_attr_generic(node_t node, const char* attr_name, const char * type_name, void* buf, int size) { - return node_get_attr_generic(node, attr_name, type_info, buf, size); + return node_get_attr_generic(node, attr_name, type_name, buf, size); } int set_node_attr_int(node_t node, const char* attr_name, const int* attr_val) { - return set_node_attr_generic(node, attr_name, &typeid(int), attr_val, sizeof(int)); + return set_node_attr_generic(node, attr_name, typeid(int).name(), attr_val, sizeof(int)); } int set_node_attr_float(node_t node, const char* attr_name, const float* attr_val) { - return set_node_attr_generic(node, attr_name, &typeid(float), attr_val, sizeof(float)); + return set_node_attr_generic(node, 
attr_name, typeid(float).name(), attr_val, sizeof(float)); } int set_node_attr_pointer(node_t node, const char* attr_name, const void* attr_val) @@ -744,9 +826,9 @@ int set_node_attr_pointer(node_t node, const char* attr_name, const void* attr_v return set_node_attr_generic(node, attr_name, nullptr, attr_val, sizeof(void*)); } -int set_node_attr_generic(node_t node, const char* attr_name, const void* type_info, const void* buf, int size) +int set_node_attr_generic(node_t node, const char* attr_name, const char* type_name, const void* buf, int size) { - return node_set_attr_generic(node, attr_name, type_info, buf, size); + return node_set_attr_generic(node, attr_name, type_name, buf, size); } tensor_t create_graph_tensor(graph_t graph, const char* tensor_name, int data_type) @@ -759,6 +841,13 @@ tensor_t create_graph_tensor(graph_t graph, const char* tensor_name, int data_ty return nullptr; } + if(data_type TENGINE_DT_INT16) + { + LOG_ERROR()<<"unknown data type: "<GetGraph(); if(real_graph->FindTensor(tensor_name)) @@ -777,6 +866,7 @@ tensor_t create_graph_tensor(graph_t graph, const char* tensor_name, int data_ty new_tensor->SetDataType(data_type); new_tensor->SetType(TENSOR_TYPE_CONST); + new_tensor->GetShape().SetDataLayout(real_graph->GetLayout()); real_graph->AddTensor(new_tensor); @@ -1469,13 +1559,5 @@ void dump_graph(graph_t graph) /* first: try to dump optimized graph */ Graph* g = executor->GetOptimizedGraph(); - if(g) - { - g->DumpGraph(); - return; - } - - /* get the origin graph */ - g = executor->GetGraph(); g->DumpGraph(); } diff --git a/core/lib/tengine_c_compat.cpp b/core/lib/tengine_c_compat.cpp index 2798190b6..ef497c2af 100644 --- a/core/lib/tengine_c_compat.cpp +++ b/core/lib/tengine_c_compat.cpp @@ -28,7 +28,6 @@ #include "tengine_c_helper.hpp" #include "exec_context.hpp" -#include "data_layout.hpp" #include "graph_executor.hpp" using namespace TEngine; @@ -114,9 +113,9 @@ int get_node_param_pointer(node_t node, const char* param_name, void* 
param_val) return get_node_attr_pointer(node, param_name, param_val); } -int get_node_param_generic(node_t node, const char* param_name, const void* type_info, void* param_val, int size) +int get_node_param_generic(node_t node, const char* param_name, const char * type_name, void* param_val, int size) { - return get_node_attr_generic(node, param_name, type_info, param_val, size); + return get_node_attr_generic(node, param_name, type_name, param_val, size); } int infer_shape(graph_t graph) @@ -128,32 +127,22 @@ int infer_shape(graph_t graph) return 0; } -int set_tensor_layout(tensor_t tensor, const char* layout) +int set_tensor_layout(tensor_t tensor, int layout) { - std::string real_layout = layout; - const DataLayout* data_layout = DataLayout::GetLayout(real_layout); - if(data_layout == nullptr) - return -1; Tensor* real_tensor = reinterpret_cast(tensor); TShape shape = real_tensor->GetShape(); - shape.SetDataLayout(real_layout); + shape.SetDataLayout(layout); real_tensor->Reshape(shape); return 0; } -int get_tensor_layout(tensor_t tensor, char* layout) +int get_tensor_layout(tensor_t tensor) { Tensor* real_tensor = reinterpret_cast(tensor); TShape shape = real_tensor->GetShape(); - const std::string& data_layout = shape.GetDataLayout(); - if(data_layout.empty()) - return -1; - int len = strlen(data_layout.c_str()); - memcpy(layout, data_layout.c_str(), len); - - return 0; + return shape.GetDataLayout(); } diff --git a/core/lib/tengine_c_helper.cpp b/core/lib/tengine_c_helper.cpp index 5b64dbe70..6400ae9f3 100644 --- a/core/lib/tengine_c_helper.cpp +++ b/core/lib/tengine_c_helper.cpp @@ -24,37 +24,33 @@ #include #include #include - +#include #include #include +#include "share_lib_parser.hpp" #include "cpu_device.h" #include "tengine_c_api.h" #include "tengine_c_compat.h" #include "tengine_c_helper.hpp" -#include "data_layout.hpp" #include "exec_context.hpp" #include "graph_executor.hpp" #include "tengine_errno.hpp" #include "static_graph_interface.hpp" #include 
"serializer.hpp" +#include "compiler_fp16.h" namespace TEngine { -extern int NodeSetParamGeneric(void* node, const char* param_name, const void* type_info, const void* param_val, +extern int NodeSetParamGeneric(void* node, const char* param_name, const char* type_name, const void* param_val, int size); -extern int NodeGetParamGeneric(void* node, const char* param_name, const void* type_info, void* param_val, int size); -extern int NodeAddParamGeneric(void* node, const char* param_name, const void* type_info, int size); +extern int NodeGetParamGeneric(void* node, const char* param_name, const char* type_name, void* param_val, int size); +extern int NodeAddParamGeneric(void* node, const char* param_name, const char* type_name, int size); } // namespace TEngine using namespace TEngine; -void __attribute__((constructor)) first_init(void) -{ - NamedData::InitPredefinedData(); -} - void set_cpu_list(const char* cpu_list_str) { char* copy_str = strdup(cpu_list_str); @@ -120,20 +116,20 @@ int dump_model(const char* model_name) return -1; } -int node_get_attr_generic(void* node, const char* param_name, const void* type_info, void* param_val, int param_size) +int node_get_attr_generic(void* node, const char* param_name, const char* type_name, void* param_val, int param_size) { - return NodeGetParamGeneric(node, param_name, type_info, param_val, param_size); + return NodeGetParamGeneric(node, param_name, type_name, param_val, param_size); } -int node_set_attr_generic(void* node, const char* param_name, const void* type_info, const void* param_val, +int node_set_attr_generic(void* node, const char* param_name, const char* type_name, const void* param_val, int param_size) { - return NodeSetParamGeneric(node, param_name, type_info, param_val, param_size); + return NodeSetParamGeneric(node, param_name, type_name, param_val, param_size); } -int node_add_attr(void* node, const char* param_name, const void* type_info, int param_size) +int node_add_attr(void* node, const char* 
param_name, const char* type_name, int param_size) { - return NodeAddParamGeneric(node, param_name, type_info, param_size); + return NodeAddParamGeneric(node, param_name, type_name, param_size); } static int real_vload_model(context_t exec_context, const char* model_name, const char* model_format, const void* addr, @@ -262,7 +258,7 @@ int save_graph_internal(graph_t graph, const char* model_format, const char* fna /* Get runtime graph pointer */ GraphExecutor* executor = static_cast(graph); - Graph* g = executor->GetGraph(); + Graph* g = executor->GetOptimizedGraph(); /* Save the graph to the files */ if(!serializer->SaveModel(file_list, g)) @@ -271,6 +267,131 @@ int save_graph_internal(graph_t graph, const char* model_format, const char* fna return 0; } +static float get_absmax_val(float* data, int data_size) +{ + float max_val = 0.f; + if(data != nullptr) + { + for(int i = 0; i < data_size; i++) + { + float abs_val = fabs(data[i]); + if(abs_val > max_val) + max_val = abs_val; + } + } + return max_val; +} + +static inline bool isSkipQuant(int nodeInedx, int node_no_quant_idxs[], int number) +{ + for(int i = 0; i < number; i++) + { + if(nodeInedx == node_no_quant_idxs[i]) + return true; + } + return false; +} +#define GET_TENGINE_DT(a) (a+1) +int quant_graph_internal(graph_t graph, int quant_mode, int node_no_quant_idxs[], int node_no_quant_number) +{ + GraphExecutor* executor = static_cast(graph); + Graph* g = executor->GetOptimizedGraph(); + + for(unsigned int i = 0; i < g->seq_nodes.size(); i++) + { + if(isSkipQuant(i, node_no_quant_idxs, node_no_quant_number)) + continue; + + Node* node = g->seq_nodes[i]; + Operator* op = node->GetOp(); + if(op->GetName() == "Const") + continue; + + /* set node output */ + Tensor* output = node->GetOutputTensor(0); + output->SetDataType(GET_TENGINE_DT(quant_mode)); + + if(op->GetName() == "Convolution" || op->GetName() == "FullyConnected") + { + // quant weight + Tensor* weight_tensor = node->GetInputTensor(1); + 
if(weight_tensor->GetDataType() == TENGINE_DT_FP32) + { + int kernel_size = (weight_tensor->GetTotalSize()) / sizeof(float); + float* kernel_org = (float*)weight_tensor->GetMemAddr(); + + // fp16 quant + if(quant_mode == TENGINE_QUANT_FP16) + { + __fp16 *kernel_new = (__fp16*)malloc(kernel_size * sizeof(__fp16)); + for(int i = 0; i < kernel_size; i++) + kernel_new[i] = fp32_to_fp16(kernel_org[i]); + + // set the memory + weight_tensor->FreeTensor(); + weight_tensor->SetMemAddr(kernel_new); + + // set the data type + weight_tensor->SetDataType(TENGINE_DT_FP16); + } + // int8 quant + else if (quant_mode == TENGINE_QUANT_INT8) + { + int8_t *kernel_new = (int8_t *)malloc(kernel_size); + float weight_max = get_absmax_val(kernel_org, kernel_size); + float weight_scale = weight_max / 127; + int zero_point = 0; + + for(int i = 0; i < kernel_size; i++) + kernel_new[i] = (int8_t)(round(kernel_org[i] / weight_scale) + zero_point); + + // set the memory + weight_tensor->FreeTensor(); + weight_tensor->SetMemAddr(kernel_new); + + // set the data type + weight_tensor->SetDataType(TENGINE_DT_INT8); + + // set the quant param + auto p_quant = weight_tensor->GetQuantParam(); + p_quant->resize(1); + QuantParam& param = (*p_quant)[0]; + param.scale = weight_scale; + param.zero_point = zero_point; + } + } + + // quant bias + if(node->GetInputNum() > 2) + { + Tensor* bias_tensor = node->GetInputTensor(2); + if(bias_tensor->GetDataType() == TENGINE_DT_FP32) + { + int bias_size = (bias_tensor->GetTotalSize()) / sizeof(float); + float* bias_org = (float*)bias_tensor->GetMemAddr(); + + if(quant_mode == TENGINE_QUANT_FP16) + { + __fp16 *bias_new = (__fp16*)malloc(bias_size * sizeof(__fp16)); + for(int i = 0; i < bias_size; i++) + bias_new[i] = fp32_to_fp16(bias_org[i]); + + // set the memory + bias_tensor->FreeTensor(); + bias_tensor->SetMemAddr(bias_new); + + // set the data type + bias_tensor->SetDataType(TENGINE_DT_FP16); + } + } + } + } + } + + return 0; +} + + graph_t 
create_graph_in_context(context_t exec_context, const char* graph_name, const char* model_name) { GraphExecutor* executor = new GraphExecutor(); @@ -301,12 +422,50 @@ extern void driver_plugin_init(void); namespace TEngine { -void InitAllPlugin(void) +int hclcpu_plugin_init(void) +{ + static ShareLibParser so_handle; + + try { + if(so_handle.Load("libhclcpu.so")<0) + { + LOG_ERROR()<<"cannot load libhclcpu.so\n"; + set_tengine_errno(ENOENT); + return -1; + } + + if(so_handle.ExecuteFunc("register_hclcpu_ops")<0) + { + LOG_ERROR()<<"register hcl cpu ops failed\n"; + set_tengine_errno(EFAULT); + return -1; + } + + } + + catch(const std::exception& e) + { + LOG_ERROR()< -#include "data_layout.hpp" #include "tensor_shape.hpp" #include "logger.hpp" #include "compiler.hpp" namespace TEngine { -void TShape::SetDim(const std::vector& args, bool layout_check) +void TShape::SetDim(const std::vector& args) { - if(layout_check) - { - const DataLayout* p_layout = DataLayout::GetLayout(layout_); - - if(args.size() != p_layout->GetDimNum()) - { - throw(std::runtime_error("shape dims mismatch")); - } - } - dim_ = args; } @@ -68,36 +57,34 @@ void TShape::DumpShape(std::ostream& os) const os << result; } -#define GET_DIM(D) \ - const DataLayout* p_layout = DataLayout::GetLayout(layout_); \ - int idx = p_layout->Get##D(); \ - if(idx < 0) \ - return 1; \ - return dim_[idx] - int TShape::GetN(void) const { - GET_DIM(N); + return Shape(0); } int TShape::GetC(void) const { - GET_DIM(C); + if(layout_==TENGINE_LAYOUT_NCHW) + return Shape(1); + else + return Shape(3); } int TShape::GetH(void) const { - GET_DIM(H); + if(layout_==TENGINE_LAYOUT_NCHW) + return Shape(2); + else + return Shape(1); } int TShape::GetW(void) const { - GET_DIM(W); + if(layout_==TENGINE_LAYOUT_NCHW) + return Shape(3); + else + return Shape(2); } -int TShape::GetD(void) const -{ - GET_DIM(D); -} } // namespace TEngine diff --git a/doc/benchmark.md b/doc/benchmark.md index 4f477abaa..c10d40702 100644 --- 
a/doc/benchmark.md +++ b/doc/benchmark.md @@ -3,110 +3,124 @@ ## **Revision Record** | Date | Rev |Change Description|Author | ---------- | --- |---|---| -| 2017-12-29 | 0.1 |Initial version|FeyaHan -| 2018-01-06 | 0.2 |Add multi CPU performance|HaoLuo -| 2018-06-14 | 0.3 |Add ACL_GPU performance| Chunying +| 2018-12-27 | 0.9 |update newest benchmark|ZhangRui/LuoHao --- ## **Catalog** -#### [Test Environment](benchmark.md#test-environment-1) -#### [Test](benchmark.md#test-1) -#### [Performance](benchmark.md#performance-1) +#### [**Test Environment**](benchmark.md#test-environment-1) +#### [**Test Steps**](benchmark.md#test-steps-1) +#### [**Performance**](benchmark.md#performance-1) --- - -## Test Environment -- Tengine : v0.3 -- Broad : ROCK960 -- CPU : Rockchip RK3399. - - * Dual-core Cortex-A72 up to 2.0GHz (real frequency is 1.8GHz); - - * Quad-core Cortex-A53 up to 1.5GHz (real frequency is 1.4GHz). - -- GPU : Mali T864 (800MHz). +## **Test Environment** +- Tengine : 0.9.0 +- Broad : **Firefly-3399 (RK3399), TinkerBoard (RK3288)** - Operating System : Ubuntu 16.04. --- -## Test +## **Test Steps** -### Step1. install Tengine +### Step1. **install Tengine** - For more information about the build of Tengine, please refer to the documentation of [install](install.md) +For more information about the build of Tengine, please refer to the documentation of [install](install.md). -### Step2. lock the cpu frequency at maximum -```bash - #switch to root user - > sudo su +### Step2. **lock the cpu frequency at maximum** - #check which available policy, policy4 for A72, policy0 for A53 - > cat /sys/devices/system/cpu/cpufreq/policy4/scaling_available_governors +Please set the scaling governer into performance. Below is an example to set the big core of RK33399 to performance mode. 
- #set performance policy - > echo performance > /sys/devices/system/cpu/cpufreq/policy4/scaling_governor - - #check cpu frequency - > cat /sys/devices/system/cpu/cpufreq/policy4/cpuinfo_cur_freq +```bash +> sudo su #switch to root user +> cat /sys/devices/system/cpu/cpufreq/policy4/scaling_available_governors #check which available policy, note that policy4 is for A72 on RK3399 and policy0 is for A53 +conservative ondemand userspace powersave interactive performance +> echo performance > /sys/devices/system/cpu/cpufreq/policy4/scaling_governor #set performance policy +> cat /sys/devices/system/cpu/cpufreq/policy4/cpuinfo_cur_freq #check cpu frequency +1800000 ``` -### Step3: test bench_sqz, bench_mobilenet +### Step3: **test benchmark squeezenet_v1.1 and mobilenet_v1** * **get model** - You can get the models from [Tengine model zoo](https://pan.baidu.com/s/1LXZ8vOdyOo50IXS0CUPp8g),the pass word is `57vb`. - And then, put the "mobilenet.caffemodel" "mobilenet_deploy.prototxt" "squeezenet_v1.1.caffemodel" "sqz.prototxt" in `~/tengine/models` - -* **set device** - 1. use ACL_GPU + You can get the models from [Tengine model zoo](https://pan.baidu.com/s/1LXZ8vOdyOo50IXS0CUPp8g), the password is `57vb`. + And then, put the "mobilenet.caffemodel", "mobilenet_deploy.prototxt", "squeezenet_v1.1.caffemodel", "sqz.prototxt" in `~/tengine/models`. - For how to build tengine with ACL_GPU, see [acl_driver.md](acl_driver.md). - You can run the test as +* **set CPU** - ``` - ./build/tests/bin/bench_sqz -d acl_opencl - ./build/tests/bin/bench_mobilenet -d acl_opencl - ``` - 2. use CPU: single-core/multi-cores + By setting the environment variable `TENGINE_CPU_LIST`, different working CPUs can be set. 
+ + For RK3399: + ``` + 1 A72: export TENGINE_CPU_LIST=5 + 2 A72: export TENGINE_CPU_LIST=4,5 + 1 A53: export TENGINE_CPU_LIST=2 + 4 A53: export TENGINE_CPU_LIST=0,1,2,3 + + ``` + For RK3288: + ``` + 1 A17: export TENGINE_CPU_LIST=2 + 4 A17: export TENGINE_CPU_LIST=0,1,2,3 + + ``` - To assign on different cpu core, there are two methods: +* **run int8/float32 inference** - - `export TENGINE_CPU_LIST=0,1,2,3` - - `tests/bin/bench_sqz –p 0,1,2,3` + By default, Tengine run inference as **float32**. To run int8 inference, you need to set the env_variable `KERNEL_MODE` as `2`. And set it back to `0` to run float32 inference. + ``` + export KERNEL_MODE=2 # run int8 inference + export KERNEL_MODE=0 # run float32 inference + ``` - For rk3399, cpu(0-3) are A53, cpu(4-5) are A72. - - - 1A72 `tests/bin/bench_sqz –p 4` - - 2A72 `tests/bin/bench_sqz –p 4,5` - - 1A53 `tests/bin/bench_sqz –p 0` - - 4A53 `tests/bin/bench_sqz –p 0,1,2,3` +--- ## Performance +### RK3399 + +#### MobileNet -| | SqueezeNet(ms) |Mobilenet (ms) | +| | Float32(ms) | INT8(ms) | | ---------- | ---|---| -| rk3399(1*A72) | 91.2 |122.1 | -| rk3399(2*A72) | 51.2 |65.4 | -| rk3399(1*A53) | 232.5 |323.6 | -| rk3399(4*A53) | 79.2 |96.3 | -| ACL(GPU)| 61.4| 95.9| +| rk3399(1*A72) | 111.8 |80.1 | +| rk3399(2*A72) | 63.7 |46.5 | +| rk3399(1*A53) | 259.6 |198.0 | +| rk3399(4*A53) | 81.6 |63.7 | -Notes:
-(1) We run N=100 times per test case.
-(2) We take the average time of N repeats. - ---- +#### SqueezeNet +| | Float32(ms) | INT8(ms) | +| ---------- | ---|---| +| rk3399(1*A72) | 79.4 |60.4 | +| rk3399(2*A72) | 49.3 |37.6 | +| rk3399(1*A53) | 177.0 |151.2 | +| rk3399(4*A53) | 68.4 |59.6 | +### RK3288 +#### MobileNet +| | Float32(ms) | INT8(ms) | +| ---------- | ---|---| +| rk3399(1*A17) | 201 |111 | +| rk3399(4*A17) | 67.4 |40 | + + +#### SqueezeNet +| | Float32(ms) | INT8(ms) | +| ---------- | ---|---| +| rk3399(1*A17) | 142 |88 | +| rk3399(4*A17) | 55 |35 | + +Notes:
+(1) We take the average time of N repeats.
+(2) We run N=100 times per test case.
diff --git a/doc/build_android.md b/doc/build_android.md index b8a97fd27..f2302d548 100644 --- a/doc/build_android.md +++ b/doc/build_android.md @@ -94,7 +94,7 @@ cp ~/ComputeLibrary/build_64/libarm_compute* ~/android-ndk-r16b/platforms/androi #For armv7: cp ~/ComputeLibrary/build_32/libarm_compute* ~/android-ndk-r16b/platforms/android-21/arch-arm/usr/lib/ cd ~/tengine/example -mkdir build +mdkir build cd build ../android_build_armv7.sh or ../android_build_armv8.sh make -j4 diff --git a/driver/acl_graph/acl_graph.hpp b/driver/acl_graph/acl_graph.hpp index 19428791f..186463a91 100644 --- a/driver/acl_graph/acl_graph.hpp +++ b/driver/acl_graph/acl_graph.hpp @@ -326,10 +326,10 @@ class CLGraph Convolution* conv_op = dynamic_cast(node->GetOp()); ConvParam* param = conv_op->GetParam(); - int pad_x = param->pads[1]; - int pad_y = param->pads[0]; - int pad_x_1 = param->pads[3]; - int pad_y_1 = param->pads[2]; + int pad_x = param->pad_w0; + int pad_y = param->pad_h0; + int pad_x_1 = param->pad_w1; + int pad_y_1 = param->pad_h1; int stride_x = param->stride_w; int stride_y = param->stride_h; int group = param->group; @@ -518,8 +518,8 @@ class CLGraph /* weight */ Tensor* w_tensor = node->GetInputTensor(1); name = w_tensor->GetName(); - int M = w_tensor->GetShape().GetH(); - int K = w_tensor->GetShape().GetW(); + int M = w_tensor->GetShape().GetN(); + int K = w_tensor->GetShape().GetC(); CLTensor* wtensor = new CLTensor(); wtensor->allocator()->init(TensorInfo(TensorShape(K, M), 1, data_type_)); tensors_map_[name] = wtensor; @@ -573,12 +573,12 @@ class CLGraph { Pooling* pool_op = dynamic_cast(node->GetOp()); PoolParam* param = pool_op->GetParam(); - int pad_x = param->pad_w; - int pad_y = param->pad_h; + int pad_x = param->pad_w0; + int pad_y = param->pad_h0; int stride_x = param->stride_w; int stride_y = param->stride_h; - int kernel_w = param->kernel_shape[1]; - int kernel_h = param->kernel_shape[0]; + int kernel_w = param->kernel_w ; + int kernel_h = param->kernel_h; int 
type = param->alg; int global = param->global; diff --git a/driver/cpu/cpu_driver.cpp b/driver/cpu/cpu_driver.cpp index 09d46082e..d4d2e92ef 100644 --- a/driver/cpu/cpu_driver.cpp +++ b/driver/cpu/cpu_driver.cpp @@ -587,7 +587,7 @@ static void probe_func(void) cpu_dev->online_cpu_number = default_param.cpu_number; } - create_cpu_device("generic_probe", cpu_dev); + create_cpu_device(cpu_dev->cpu_name, cpu_dev); cpu_dev->online_cpu_list = saved_list; cpu_dev->online_cpu_number = saved_number; diff --git a/driver/cpu/cpu_probe.cpp b/driver/cpu/cpu_probe.cpp index 73ab8eaf8..4138d8940 100644 --- a/driver/cpu/cpu_probe.cpp +++ b/driver/cpu/cpu_probe.cpp @@ -25,104 +25,107 @@ #include #include #include +#include #include #include #include "cpu_device.h" -int get_cpu_number(void) -{ - FILE* fp = fopen("/proc/cpuinfo", "rb"); - int num = 0; - char buf[256]; - - if(fp == NULL) - return 1; +struct cpu_item { + int cpu_id; + int max_freq; + int cluster_leader; +}; - while(fgets(buf, 256, fp)) - { - if(memcmp(buf, "processor", 9) == 0) - num++; - } +/* + for the meaning files in /sys/device/system/cpu/cpu0/cpufreq + please read documentation/cpu-freq/user-guide.txt +*/ - fclose(fp); +int get_cpu_items(struct cpu_item ** p_item) +{ + char cpu_path[128]; + char file_path[128]; + struct cpu_item * cpu_item=NULL; + struct stat stat_buf; + int i=0; - if(num < 1) - num = 1; + while(1) + { + FILE * fp; + int ret; - return num; -} + sprintf(cpu_path,"/sys/devices/system/cpu/cpu%d/cpufreq",i); -#ifdef __ARM_ARCH + if(stat(cpu_path,&stat_buf)<0) + break; -#ifdef __ANDROID__ -int get_cpu_max_freq(int id) -{ - char fname[256]; - int max_freq = 100; - FILE* fp = NULL; + cpu_item=(struct cpu_item * )realloc(cpu_item,sizeof(struct cpu_item)*(i+1)); - sprintf(fname, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", id); + cpu_item[i].cpu_id=i; - fp = fopen(fname, "rb"); + ret=snprintf(file_path,128,"%s/cpuinfo_max_freq",cpu_path); - if(!fp) - { - sprintf(fname, 
"/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", id); - fp = fopen(fname, "rb"); - } + if(ret>=128) + file_path[127]=0x0; + + fp=fopen(file_path,"rb"); - if(fp) - { - while(!feof(fp)) - { - int freq; - if(fscanf(fp, "%d %*d\n", &freq) != 1) - break; + if(fp==NULL) + break; - if(freq > max_freq) - max_freq = freq; - } + if(fscanf(fp, "%d", &cpu_item[i].max_freq)<0) + { + fclose(fp); + break; + } - fclose(fp); + fclose(fp); - return max_freq; - } + ret=snprintf(file_path,128,"%s/related_cpus",cpu_path); - sprintf(fname, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", id); - fp = fopen(fname, "rb"); + if(ret>=128) + file_path[127]=0x0; - if(fp) - { - fscanf(fp, "%d", &max_freq); - fclose(fp); - } + fp=fopen(file_path,"rb"); - return max_freq; -} + if(fp==NULL) + break; -#else -int get_cpu_max_freq(int id) -{ - char cpu_fname[256]; - FILE* fp; - int max_freq; + if(fscanf(fp,"%d ",&cpu_item[i].cluster_leader)<0) + { + fclose(fp); + break; + } - sprintf(cpu_fname, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_max_freq", id); + fclose(fp); - fp = fopen(cpu_fname, "r"); + i++; - if(!fp) - return 0; + } - if(fscanf(fp, "%d", &max_freq) < 0) - return 0; + if(i==0) + { + /* + some weird thing happened! 
just fill a fake one + TODO: add a log here + */ + + cpu_item=(struct cpu_item *)malloc(sizeof(struct cpu_item)); + + cpu_item[0].cpu_id=0; + cpu_item[0].max_freq=100; + cpu_item[0].cluster_leader=0; + + i++; + } - fclose(fp); + *p_item=cpu_item; - return max_freq; + return i; } -#endif + +#ifdef __ARM_ARCH static char* get_target_line(FILE* fp, const char* target_prefix) { @@ -137,7 +140,7 @@ static char* get_target_line(FILE* fp, const char* target_prefix) return nullptr; } -int get_cpu_model_arch(int id, struct cpu_cluster* cluster) +static int get_cpu_model_arch(int id, struct cpu_cluster* cluster) { char cpu_fname[256]; FILE* fp; @@ -239,52 +242,83 @@ int get_cpu_model_arch(int id, struct cpu_cluster* cluster) return 0; } +#else + +static int get_cpu_model_arch(int id, struct cpu_cluster* cluster) +{ + cluster->cpu_model = CPU_GENERIC; + cluster->cpu_arch = CPU_GENERIC; + cluster->l1_size = 32 << 10; + cluster->l2_size = 512 << 10; + + return 0; +} + +#endif struct cpu_info* probe_system_cpu(void) { static struct cpu_info cpu_dev; - int cluster_idx = -1; - int last_max_freq = -1; - int top_max_freq = -1; + struct cpu_item * cpu_item; + int cpu_number; + int cluster_number=1; - int cpu_number = get_cpu_number(); - struct cpu_cluster* cpu_cluster = ( struct cpu_cluster* )malloc(sizeof(struct cpu_cluster) * (cpu_number / 4 + 1)); + cpu_number=get_cpu_items(&cpu_item); - for(int i = 0; i < cpu_number; i++) + /* assuming cluster cpus are continuous */ + for(int i=1;icpu_number = 0; - } - else - cluster = cpu_cluster + cluster_idx; + memset(cpu_cluster->hw_cpu_id,-1,sizeof(int)*MAX_CLUSTER_CPU_NUMBER); - cluster->max_freq = max_freq; - cluster->cpu_number++; + /* setup cpu 0 */ + cpu_cluster[0].cpu_number=1; + cpu_cluster[0].max_freq=cpu_item[0].max_freq; + cpu_cluster[0].hw_cpu_id[0]=cpu_item[0].cpu_id; - last_max_freq = max_freq; - if(top_max_freq < max_freq) - top_max_freq = max_freq; + int top_max_freq=0; + struct cpu_cluster * cluster=cpu_cluster; + + for(int 
i=1;ihw_cpu_id,-1,sizeof(int)*MAX_CLUSTER_CPU_NUMBER); + cluster->cpu_number=0; + cluster->max_freq=cpu_item[i].max_freq; - if(get_cpu_model_arch(i, cluster) < 0) - return NULL; + if(cluster->max_freq>top_max_freq) + top_max_freq=cluster->max_freq; + } + + cluster->hw_cpu_id[cluster->cpu_number]=cpu_item[i].cpu_id; + cluster->cpu_number++; } - int start_cpu = 0; + free(cpu_item); + + for(int i=0;ihw_cpu_id[0],cluster); + } - cpu_dev.cluster_number = cluster_idx + 1; - cpu_dev.cluster = cpu_cluster; + cpu_dev.cluster_number=cluster_number; + cpu_dev.cluster=cpu_cluster; - // setup the online cpu according to top_max_freq cpu_dev.online_cpu_list = ( int* )malloc(sizeof(int) * cpu_number); + int online_cpu_number = 0; for(int i = 0; i < cpu_dev.cluster_number; i++) @@ -293,79 +327,26 @@ struct cpu_info* probe_system_cpu(void) for(int j = 0; j < cluster->cpu_number; j++) { - cluster->hw_cpu_id[j] = start_cpu + j; - - if(cluster->max_freq == top_max_freq) + if(cluster->max_freq >= top_max_freq) { cpu_dev.online_cpu_list[online_cpu_number++] = cluster->hw_cpu_id[j]; } } - - start_cpu += cluster->cpu_number; } +#ifdef __ARM_ARCH cpu_dev.cpu_name = "arm.probed"; - cpu_dev.board_name = "generic.probed"; - - cpu_dev.online_cpu_number = online_cpu_number; - - return &cpu_dev; -} - #else + cpu_dev.cpu_name = "x86.probed"; +#endif -struct cpu_info* probe_system_cpu(void) -{ - /* create cpu_info */ - static struct cpu_info cpu_dev; - - int cpu_number = get_cpu_number(); - - struct cpu_cluster* cpu_cluster = ( struct cpu_cluster* )malloc(sizeof(struct cpu_cluster) * (cpu_number / 4 + 1)); - - int cluster_number = 0; - - for(int i = 0; i < cpu_number; i += 4) - { - struct cpu_cluster* cluster = cpu_cluster + cluster_number; - int start_cpu_id = cluster_number * 4; - - cluster->cpu_number = start_cpu_id + 4 > cpu_number ? 
cpu_number - start_cpu_id : 4; - cluster->max_freq = 2000; - cluster->cpu_model = CPU_GENERIC; - cluster->cpu_arch = CPU_GENERIC; - cluster->l1_size = 32 << 10; - cluster->l2_size = 512 << 10; - - for(int j = 0; j < cluster->cpu_number; j++) - cluster->hw_cpu_id[j] = start_cpu_id + j; - - cluster_number++; - } - - int online_cpu_number = cpu_number; - - cpu_dev.cpu_name = "geneirc chip"; - cpu_dev.board_name = "generic board"; - - cpu_dev.cluster_number = 1; - cpu_dev.l3_size = 512 << 10; - + cpu_dev.board_name = "generic.probed"; cpu_dev.online_cpu_number = online_cpu_number; - cpu_dev.online_cpu_list = ( int* )malloc(sizeof(int) * online_cpu_number); - - for(int i = 0; i < cpu_number; i++) - { - cpu_dev.online_cpu_list[i] = i; - } - - cpu_dev.cluster_number = cluster_number; - cpu_dev.cluster = cpu_cluster; return &cpu_dev; } -#endif + void free_probe_cpu_info(struct cpu_info* cpu_dev) { diff --git a/driver/cpu/cpu_runner.cpp b/driver/cpu/cpu_runner.cpp index 87e9116e8..171002378 100644 --- a/driver/cpu/cpu_runner.cpp +++ b/driver/cpu/cpu_runner.cpp @@ -271,7 +271,7 @@ static void parse_node(void* data, int repeat_count, uint64_t total_time) { Convolution* conv_op = dynamic_cast(node->GetOp()); ConvParam* param = conv_op->GetParam(); - printf("%2d x %d / %d_p%d", param->kernel_h, param->kernel_w, param->stride_h, param->pads[0]); + printf("%2d x %d / %d_p%d", param->kernel_h, param->kernel_w, param->stride_h, param->pad_h0); // if(param->kernel_h==3 && param->stride_h==1) // { // printf(" [%d]",param->mth); @@ -386,7 +386,7 @@ bool CPURunner::Run(Subgraph* sub_graph) std::vector outputs(output_number); - if(!op->InferShape(inputs, outputs, node_ops->exec_attr->layout)) + if(!op->InferShape(inputs, outputs, node_ops->exec_attr->graph_layout)) { XLOG_ERROR() << "infer shaped for node: " << node->GetName() << " op: " << op->GetName() << " failed\n"; ret = false; @@ -654,10 +654,13 @@ bool CPURunner::FreeMem(Subgraph* sub_graph) 
sub_graph->RemoveAttr("shared_temp_memory"); } - MemPool* mem_pool = any_cast(sub_graph->GetAttr("MemPool")); - delete mem_pool; + if(sub_graph->ExistAttr("MemPool")) + { + MemPool* mem_pool = any_cast(sub_graph->GetAttr("MemPool")); + delete mem_pool; - sub_graph->RemoveAttr("MemPool"); + sub_graph->RemoveAttr("MemPool"); + } return true; } @@ -888,11 +891,12 @@ bool CPURunner::AllocateMem(Subgraph* sub_graph) { void* tensor_addr = get_tensor_mem(input_tensor); int total_size = tensor->GetTotalSize(); - set_tensor_mem(tensor, tensor_addr, total_size, nullptr); - - mem_pool->AddRef(tensor); + if(set_tensor_mem(tensor, tensor_addr, total_size, nullptr)) + { + mem_pool->AddRef(tensor); - continue; + continue; + } } } @@ -900,14 +904,16 @@ bool CPURunner::AllocateMem(Subgraph* sub_graph) { int total_size = tensor->GetTotalSize(); void* tensor_addr = mem_pool->Allocate(tensor, total_size); - set_tensor_mem(tensor, tensor_addr, total_size, nullptr); + if(!set_tensor_mem(tensor, tensor_addr, total_size, nullptr)) + return false; } } /* input tensor */ for(unsigned int i = 0; i < node->GetInputNum(); i++) { Tensor* input_tensor = node->GetInputTensor(i); - mem_pool->Free(input_tensor); + if(input_tensor->GetName() != "data") + mem_pool->Free(input_tensor); } } diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9c9ab147a..164187f24 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -38,12 +38,10 @@ add_subdirectory(faster_rcnn) add_subdirectory(lighten_cnn) add_subdirectory(imagenet_classification) add_subdirectory(mobilenet_ssd) - if( NOT ANDROID) - #add_subdirectory(caffe_wrapper) - #add_subdirectory(tensorflow_wrapper) + add_subdirectory(caffe_wrapper) + add_subdirectory(tensorflow_wrapper) endif() - add_subdirectory(tengine_model) diff --git a/examples/YuFaceDetectNet/yu_facedetect.cpp b/examples/YuFaceDetectNet/yu_facedetect.cpp index 9eea88718..b7aefc96a 100644 --- a/examples/YuFaceDetectNet/yu_facedetect.cpp +++ 
b/examples/YuFaceDetectNet/yu_facedetect.cpp @@ -1,211 +1,211 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2019, Open AI Lab - * Author: chunyinglv@openailab.com - */ - -#include -#include -#include - -#include "tengine_c_api.h" -#include "opencv2/imgproc/imgproc.hpp" -#include "opencv2/highgui/highgui.hpp" - -float show_threshold = 0.5; - -struct Box -{ - float x0; - float y0; - float x1; - float y1; - int class_idx; - float score; -}; - -void post_process_ssd(cv::Mat& img, float threshold, float* outdata, int num, const std::string& save_name) -{ - std::vector boxes; - int line_width = img.cols * 0.005; - printf("--------------------------------------------\n"); - printf("Face id: prob%%\tBOX:( x0 , y0 ),( x1 , y1 )\n"); - printf("--------------------------------------------\n"); - int detected_face_num = 0; - for(int i = 0; i < num; i++) - { - if(outdata[1] >= threshold) - { - detected_face_num += 1; - Box box; - box.class_idx = outdata[0]; - box.score = outdata[1]; - box.x0 = outdata[2] * img.cols; - box.y0 = outdata[3] * img.rows; - box.x1 = outdata[4] * img.cols; - box.y1 = outdata[5] * img.rows; - boxes.push_back(box); - printf("Face %d:\t%.0f%%\t", detected_face_num, 
box.score * 100); - printf("BOX:( %g , %g ),( %g , %g )\n", box.x0, box.y0, box.x1, box.y1); - } - outdata += 6; - } - printf("detect faces : %d \n", detected_face_num); - for(int i = 0; i < ( int )boxes.size(); i++) - { - Box box = boxes[i]; - cv::rectangle(img, cv::Rect(box.x0, box.y0, (box.x1 - box.x0), (box.y1 - box.y0)), cv::Scalar(255, 255, 0), - line_width); - - std::ostringstream score_str; - score_str.precision(3); - score_str << box.score; - std::string label = score_str.str(); - int baseLine = 0; - cv::Size label_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.3, 1, &baseLine); - cv::rectangle(img, - cv::Rect(cv::Point(box.x0, box.y0 - label_size.height), - cv::Size(label_size.width, label_size.height + baseLine)), - cv::Scalar(255, 255, 0), CV_FILLED); - cv::putText(img, label, cv::Point(box.x0, box.y0), cv::FONT_HERSHEY_SIMPLEX, 0.3, cv::Scalar(0, 0, 0)); - } - cv::imwrite(save_name, img); - std::cout << "======================================\n"; - std::cout << "[DETECTED IMAGE SAVED]:\t" << save_name << "\n"; - std::cout << "======================================\n"; -} - -void get_input_data(cv::Mat& img, float* input_data, int img_h, int img_w) -{ - int mean[3] = { 104,117,123 }; - unsigned char* src_ptr=(unsigned char*)(img.ptr(0)); - int hw = img_h * img_w; - for(int h = 0; h < img_h; h++) - { - for(int w = 0; w < img_w; w++) - { - for(int c = 0; c < 3; c++) - { - input_data[c * hw + h * img_w + w] =(float)(*src_ptr - mean[c]); - src_ptr++; - } - } - } -} - -int main(int argc, char* argv[]) -{ - if(argc < 4) - { - std::cout << "[Usage]: " << argv[0] << " \n"; - return 0; - } - std::string proto_name_ = argv[1]; - std::string mdl_name_ = argv[2]; - std::string image_file = argv[3]; - - std::string save_file = "save.jpg"; - - cv::Mat img = cv::imread(image_file); - if(img.empty()) - { - std::cerr << "failed to read image file " << image_file << "\n"; - return -1; - } -#if 1 - // resize to 320 x 240 - cv::Mat resize_img; - int img_w = 320; 
- int img_h = 240; - cv::resize(img, resize_img, cv::Size(img_w, img_h), 0, 0,cv::INTER_NEAREST); - float* input_data = ( float* )malloc(sizeof(float) * img_h * img_w * 3); - get_input_data(resize_img, input_data, img_h, img_w); -#else - // use origin image size - int img_h = img.rows; - int img_w = img.cols; - float* input_data = ( float* )malloc(sizeof(float) * img_h * img_w * 3); - get_input_data(img, input_data, img_h, img_w); -#endif - - init_tengine(); - if(request_tengine_version("0.9") < 0) - return 1; - - graph_t graph = create_graph(nullptr, "caffe", proto_name_.c_str(), mdl_name_.c_str()); - if(graph == nullptr) - { - std::cout << "Create graph0 failed\n"; - std::cout << "errno: " << get_tengine_errno() << "\n"; - return -1; - } - - /* get input tensor */ - int node_idx = 0; - int tensor_idx = 0; - tensor_t input_tensor = get_graph_input_tensor(graph, node_idx, tensor_idx); - int dims[] = {1, 3, img_h, img_w}; - set_tensor_shape(input_tensor, dims, 4); - /* setup input buffer */ - if(set_tensor_buffer(input_tensor, input_data, 3 * img_h * img_w * 4) < 0) - { - std::printf("Set buffer for tensor failed\n"); - return -1; - } - - prerun_graph(graph); - - // time run_graph - int repeat_count = 1; - const char* repeat = std::getenv("REPEAT_COUNT"); - if(repeat) - repeat_count = std::strtoul(repeat, NULL, 10); - - struct timeval t0, t1; - float avg_time = 0.f; - gettimeofday(&t0, NULL); - for(int i = 0; i < repeat_count; i++) - run_graph(graph, 1); - gettimeofday(&t1, NULL); - float mytime = ( float )((t1.tv_sec * 1000000 + t1.tv_usec) - (t0.tv_sec * 1000000 + t0.tv_usec)) / 1000; - avg_time += mytime; - std::cout << "--------------------------------------\n"; - std::cout << "repeat " << repeat_count << " times, avg time per run is " << avg_time / repeat_count << " ms\n"; - - // post process - tensor_t out_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out"); - int out_dim[4]; - get_tensor_shape(out_tensor, out_dim, 4); - float* outdata = ( float* 
)get_tensor_buffer(out_tensor); - int num = out_dim[1]; - - post_process_ssd(img, show_threshold, outdata, num, save_file.c_str()); - - // free - release_graph_tensor(out_tensor); - release_graph_tensor(input_tensor); - postrun_graph(graph); - destroy_graph(graph); - free(input_data); - release_tengine(); - - return 0; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: chunyinglv@openailab.com + */ + +#include +#include +#include + +#include "tengine_c_api.h" +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" + +float show_threshold = 0.5; + +struct Box +{ + float x0; + float y0; + float x1; + float y1; + int class_idx; + float score; +}; + +void post_process_ssd(cv::Mat& img, float threshold, float* outdata, int num, const std::string& save_name) +{ + std::vector boxes; + int line_width = img.cols * 0.005; + printf("--------------------------------------------\n"); + printf("Face id: prob%%\tBOX:( x0 , y0 ),( x1 , y1 )\n"); + printf("--------------------------------------------\n"); + int detected_face_num = 0; + for(int i = 0; i < num; i++) + { + if(outdata[1] >= threshold) + { + detected_face_num += 1; + Box box; + box.class_idx = outdata[0]; + box.score = outdata[1]; + box.x0 = outdata[2] * img.cols; + box.y0 = outdata[3] * img.rows; + box.x1 = outdata[4] * img.cols; + box.y1 = outdata[5] * img.rows; + boxes.push_back(box); + printf("Face %d:\t%.0f%%\t", detected_face_num, box.score * 100); + printf("BOX:( %g , %g ),( %g , %g )\n", box.x0, box.y0, box.x1, box.y1); + } + outdata += 6; + } + printf("detect faces : %d \n", detected_face_num); + for(int i = 0; i < ( int )boxes.size(); i++) + { + Box box = boxes[i]; + cv::rectangle(img, cv::Rect(box.x0, box.y0, (box.x1 - box.x0), (box.y1 - box.y0)), cv::Scalar(255, 255, 0), + line_width); + + std::ostringstream score_str; + score_str.precision(3); + score_str << box.score; + std::string label = score_str.str(); + int baseLine = 0; + cv::Size label_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.3, 1, &baseLine); + cv::rectangle(img, + cv::Rect(cv::Point(box.x0, box.y0 - label_size.height), + cv::Size(label_size.width, label_size.height + baseLine)), + cv::Scalar(255, 255, 0), CV_FILLED); + cv::putText(img, label, cv::Point(box.x0, box.y0), cv::FONT_HERSHEY_SIMPLEX, 0.3, cv::Scalar(0, 
0, 0)); + } + cv::imwrite(save_name, img); + std::cout << "======================================\n"; + std::cout << "[DETECTED IMAGE SAVED]:\t" << save_name << "\n"; + std::cout << "======================================\n"; +} + +void get_input_data(cv::Mat& img, float* input_data, int img_h, int img_w) +{ + int mean[3] = { 104,117,123 }; + unsigned char* src_ptr=(unsigned char*)(img.ptr(0)); + int hw = img_h * img_w; + for(int h = 0; h < img_h; h++) + { + for(int w = 0; w < img_w; w++) + { + for(int c = 0; c < 3; c++) + { + input_data[c * hw + h * img_w + w] =(float)(*src_ptr - mean[c]); + src_ptr++; + } + } + } +} + +int main(int argc, char* argv[]) +{ + if(argc < 4) + { + std::cout << "[Usage]: " << argv[0] << " \n"; + return 0; + } + std::string proto_name_ = argv[1]; + std::string mdl_name_ = argv[2]; + std::string image_file = argv[3]; + + std::string save_file = "save.jpg"; + + cv::Mat img = cv::imread(image_file); + if(img.empty()) + { + std::cerr << "failed to read image file " << image_file << "\n"; + return -1; + } +#if 1 + // resize to 320 x 240 + cv::Mat resize_img; + int img_w = 320; + int img_h = 240; + cv::resize(img, resize_img, cv::Size(img_w, img_h), 0, 0,cv::INTER_NEAREST); + float* input_data = ( float* )malloc(sizeof(float) * img_h * img_w * 3); + get_input_data(resize_img, input_data, img_h, img_w); +#else + // use origin image size + int img_h = img.rows; + int img_w = img.cols; + float* input_data = ( float* )malloc(sizeof(float) * img_h * img_w * 3); + get_input_data(img, input_data, img_h, img_w); +#endif + + init_tengine(); + if(request_tengine_version("0.9") < 0) + return 1; + + graph_t graph = create_graph(nullptr, "caffe", proto_name_.c_str(), mdl_name_.c_str()); + if(graph == nullptr) + { + std::cout << "Create graph0 failed\n"; + std::cout << "errno: " << get_tengine_errno() << "\n"; + return -1; + } + + /* get input tensor */ + int node_idx = 0; + int tensor_idx = 0; + tensor_t input_tensor = get_graph_input_tensor(graph, 
node_idx, tensor_idx); + int dims[] = {1, 3, img_h, img_w}; + set_tensor_shape(input_tensor, dims, 4); + /* setup input buffer */ + if(set_tensor_buffer(input_tensor, input_data, 3 * img_h * img_w * 4) < 0) + { + std::printf("Set buffer for tensor failed\n"); + return -1; + } + + prerun_graph(graph); + + // time run_graph + int repeat_count = 1; + const char* repeat = std::getenv("REPEAT_COUNT"); + if(repeat) + repeat_count = std::strtoul(repeat, NULL, 10); + + struct timeval t0, t1; + float avg_time = 0.f; + gettimeofday(&t0, NULL); + for(int i = 0; i < repeat_count; i++) + run_graph(graph, 1); + gettimeofday(&t1, NULL); + float mytime = ( float )((t1.tv_sec * 1000000 + t1.tv_usec) - (t0.tv_sec * 1000000 + t0.tv_usec)) / 1000; + avg_time += mytime; + std::cout << "--------------------------------------\n"; + std::cout << "repeat " << repeat_count << " times, avg time per run is " << avg_time / repeat_count << " ms\n"; + + // post process + tensor_t out_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out"); + int out_dim[4]; + get_tensor_shape(out_tensor, out_dim, 4); + float* outdata = ( float* )get_tensor_buffer(out_tensor); + int num = out_dim[1]; + + post_process_ssd(img, show_threshold, outdata, num, save_file.c_str()); + + // free + release_graph_tensor(out_tensor); + release_graph_tensor(input_tensor); + postrun_graph(graph); + destroy_graph(graph); + free(input_data); + release_tengine(); + + return 0; +} diff --git a/examples/caffe_wrapper/cpp_classification/CMakeLists.txt b/examples/caffe_wrapper/cpp_classification/CMakeLists.txt index dccedf463..5849f617f 100644 --- a/examples/caffe_wrapper/cpp_classification/CMakeLists.txt +++ b/examples/caffe_wrapper/cpp_classification/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(classification) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS wrapper tengine protobuf boost_thread glog ) +set( TENGINE_LIBS wrapper tengine hclcpu protobuf boost_thread glog ) set( 
CODE_SRC classification.cpp ) #flag @@ -23,9 +23,14 @@ include_directories(${INSTALL_DIR}/wrapper_include link_directories(${INSTALL_DIR}/lib) #exe + +if ( NOT ARM) +set (OPEN_BLAS_LIB openblas) +endif() + add_executable(classification ${CODE_SRC}) -target_link_libraries(classification ${TENGINE_LIBS} ${OpenCV_LIBS}) +target_link_libraries(classification ${TENGINE_LIBS} ${OpenCV_LIBS} ${OPEN_BLAS_LIB}) add_executable(classification_mobilenet ${CODE_SRC}) target_compile_definitions(classification_mobilenet PUBLIC -DMOBILE_NET) -target_link_libraries(classification_mobilenet ${TENGINE_LIBS} ${OpenCV_LIBS}) +target_link_libraries(classification_mobilenet ${TENGINE_LIBS} ${OpenCV_LIBS} ${OPEN_BLAS_LIB}) diff --git a/examples/caffe_wrapper/mtcnn/CMakeLists.txt b/examples/caffe_wrapper/mtcnn/CMakeLists.txt index 15ca8b939..57b4aee09 100644 --- a/examples/caffe_wrapper/mtcnn/CMakeLists.txt +++ b/examples/caffe_wrapper/mtcnn/CMakeLists.txt @@ -4,7 +4,7 @@ project(CAFFE_MTCNN) set( TENGINE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../ ) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS wrapper tengine ) +set( TENGINE_LIBS wrapper tengine hclcpu) set( CAFFE_MTCNN_SRCS test_caffe_mtcnn.cpp caffe_mtcnn.cpp caffe_mtcnn_utils.cpp ) #flag @@ -21,5 +21,9 @@ include_directories(${INSTALL_DIR}/wrapper_include link_directories(${INSTALL_DIR}/lib) #exe +if ( NOT (ARM OR ANDROID) ) +set (OPEN_BLAS_LIB openblas) +endif() + add_executable(CAFFE_MTCNN ${CAFFE_MTCNN_SRCS}) -target_link_libraries(CAFFE_MTCNN ${TENGINE_LIBS} ${OpenCV_LIBS} boost_thread) +target_link_libraries(CAFFE_MTCNN ${TENGINE_LIBS} ${OpenCV_LIBS} boost_thread ${OPEN_BLAS_LIB}) diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 666c134cc..f3fd3727f 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -101,24 +101,24 @@ bool set_tengine_config() */ std::string get_file(const char* fname) { - std::fstream test_fs; + FILE* fp; std::string fn = fname; const std::string 
mod_sch1 = "./" + fn; const std::string mod_sch2 = get_root_path() + "models/" + fn; - test_fs.open(mod_sch1.c_str()); - if(test_fs.is_open()) + fp = fopen(mod_sch1.c_str(), "r"); + if(fp) { - test_fs.close(); + fclose(fp); return mod_sch1; } else { - test_fs.open(mod_sch2.c_str()); - if(test_fs.is_open()) + fp = fopen(mod_sch2.c_str(), "r"); + if(fp) { - test_fs.close(); + fclose(fp); return mod_sch2; } else diff --git a/examples/faster_rcnn/CMakeLists.txt b/examples/faster_rcnn/CMakeLists.txt index 2d4da5631..1e8a54d8a 100644 --- a/examples/faster_rcnn/CMakeLists.txt +++ b/examples/faster_rcnn/CMakeLists.txt @@ -4,7 +4,8 @@ project(FASTER_RCNN) link_directories( ${PROTOBUF_DIR}/lib ) add_definitions(-std=c++11) set( INSTALL_DIR ${TENGINE_DIR}/install ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) + if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) endif() @@ -23,6 +24,11 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() + + #opencv find_package(OpenCV REQUIRED) @@ -37,7 +43,7 @@ add_executable(FASTER_RCNN ${RESNET_SRCS}) if( ANDROID) target_link_libraries(FASTER_RCNN ${TENGINE_LIBS} ${PROTOBUF_LIB} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(FASTER_RCNN ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(FASTER_RCNN ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/imagenet_classification/CMakeLists.txt b/examples/imagenet_classification/CMakeLists.txt index 54f3b0222..7ce54f998 100644 --- a/examples/imagenet_classification/CMakeLists.txt +++ b/examples/imagenet_classification/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(Classify) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB 
${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) endif() @@ -20,6 +20,9 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() set( CODE_SRCS classification.cpp model_config.cpp ../common/common.cpp ) set( BIN_EXE Classify ) @@ -38,5 +41,5 @@ add_executable(${BIN_EXE} ${CODE_SRCS}) if(ANDROID) target_link_libraries(${BIN_EXE} ${PROTOBUF_LIB} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/lighten_cnn/CMakeLists.txt b/examples/lighten_cnn/CMakeLists.txt index ce2a9d805..b04a1fe9b 100644 --- a/examples/lighten_cnn/CMakeLists.txt +++ b/examples/lighten_cnn/CMakeLists.txt @@ -3,7 +3,8 @@ cmake_minimum_required (VERSION 2.8) project(LIGHTEN_CNN) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) + if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) endif() @@ -21,6 +22,11 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() + + #opencv find_package(OpenCV REQUIRED) @@ -35,6 +41,6 @@ add_executable(LIGHTEN_CNN ${CODE_SRCS}) if( ANDROID) target_link_libraries(LIGHTEN_CNN ${TENGINE_LIBS} ${PROTOBUF_LIB} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(LIGHTEN_CNN ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(LIGHTEN_CNN ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/linux_build.sh b/examples/linux_build.sh index 725c3079e..e6f35dc51 100755 --- a/examples/linux_build.sh +++ b/examples/linux_build.sh @@ -1,4 +1,4 @@ #!/bin/bash -cmake -DTENGINE_DIR=/home/usr/tengine \ +cmake -DTENGINE_DIR=/home/haitao/workshop/tengine \ .. 
diff --git a/examples/mobilenet_ssd/CMakeLists.txt b/examples/mobilenet_ssd/CMakeLists.txt index 426646c56..3ba1e6806 100644 --- a/examples/mobilenet_ssd/CMakeLists.txt +++ b/examples/mobilenet_ssd/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(MSSD) add_definitions(-std=c++11) set( INSTALL_DIR ${TENGINE_DIR}/install/) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) @@ -21,6 +21,9 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() set( CODE_SRCS mssd.cpp ../common/common.cpp) @@ -39,7 +42,7 @@ add_executable(MSSD ${CODE_SRCS}) if( ANDROID) target_link_libraries(MSSD ${TENGINE_LIBS} ${PROTOBUF_LIB} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(MSSD ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(MSSD ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/mtcnn/CMakeLists.txt b/examples/mtcnn/CMakeLists.txt index 205f12e6d..6f6d8f49a 100644 --- a/examples/mtcnn/CMakeLists.txt +++ b/examples/mtcnn/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(MTCNN) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) endif() @@ -20,6 +20,10 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() + set( MTCNN_SRCS mtcnn_utils.cpp mtcnn.cpp test_mtcnn.cpp ../common/common.cpp ) #opencv @@ -37,6 +41,6 @@ add_executable(MTCNN ${MTCNN_SRCS}) if( ANDROID) target_link_libraries(MTCNN ${TENGINE_LIBS} ${PROTOBUF_LIB} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(MTCNN ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(MTCNN ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff 
--git a/examples/ssd/CMakeLists.txt b/examples/ssd/CMakeLists.txt index 94cf49cc3..2cc458ca3 100644 --- a/examples/ssd/CMakeLists.txt +++ b/examples/ssd/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(SSD) set( INSTALL_DIR ${TENGINE_DIR}/install/) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) endif() @@ -21,6 +21,9 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() #opencv find_package(OpenCV REQUIRED) @@ -37,7 +40,7 @@ add_executable(SSD ${CODE_SRCS}) if( ANDROID) target_link_libraries(SSD ${TENGINE_LIBS} ${PROTOBUF_LIB} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(SSD ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(SSD ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/tengine_model/classification/CMakeLists.txt b/examples/tengine_model/classification/CMakeLists.txt index fe6754b18..02fdd4c56 100644 --- a/examples/tengine_model/classification/CMakeLists.txt +++ b/examples/tengine_model/classification/CMakeLists.txt @@ -3,7 +3,8 @@ cmake_minimum_required (VERSION 2.8) project(tm_classify) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) + if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) endif() @@ -20,10 +21,25 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() set( CODE_SRCS classification.cpp ../../common/common.cpp ) set( BIN_EXE tm_classify ) +set( CODE_SRCS_TF classification_tf.cpp ../../common/common.cpp ) +set( BIN_EXE_TF tm_classify_tf ) + +set( CODE_SRCS_INT8 classification_int8.cpp ../../common/common.cpp ) +set( BIN_EXE_INT8 tm_classify_int8 ) + +if(NOT ANDROID AND CMAKE_SIZEOF_VOID_P EQUAL 4) + 
add_definitions(-mfp16-format=ieee -mfpu=neon-fp16) +endif() +set( CODE_SRCS_FP16 classification_fp16.cpp ../../common/common.cpp ) +set( BIN_EXE_FP16 tm_classify_fp16 ) + #flag #set(CMAKE_CXX_FLAGS "-std=c++11 -O3 -Wall") @@ -31,7 +47,8 @@ find_package(OpenCV REQUIRED) #include include_directories(${INSTALL_DIR}/include - ${TENGINE_DIR}/examples/common) + ${TENGINE_DIR}/examples/common + ${TENGINE_DIR}/core/include) #lib link_directories(${INSTALL_DIR}/lib) @@ -41,5 +58,27 @@ add_executable(${BIN_EXE} ${CODE_SRCS}) if(ANDROID) target_link_libraries(${BIN_EXE} ${PROTOBUF_LIB} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) +endif() + +add_executable(${BIN_EXE_TF} ${CODE_SRCS_TF}) +if(ANDROID) + target_link_libraries(${BIN_EXE_TF} ${PROTOBUF_LIB} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) +else() + target_link_libraries(${BIN_EXE_TF} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() + +add_executable(${BIN_EXE_INT8} ${CODE_SRCS_INT8}) +if(ANDROID) + target_link_libraries(${BIN_EXE_INT8} ${PROTOBUF_LIB} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) +else() + target_link_libraries(${BIN_EXE_INT8} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) +endif() + +add_executable(${BIN_EXE_FP16} ${CODE_SRCS_FP16}) +if(ANDROID) + target_link_libraries(${BIN_EXE_FP16} ${PROTOBUF_LIB} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) +else() + target_link_libraries(${BIN_EXE_FP16} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) +endif() + diff --git a/examples/tengine_model/classification/classification_fp16.cpp b/examples/tengine_model/classification/classification_fp16.cpp new file mode 100644 index 000000000..bc0cd2555 --- /dev/null +++ b/examples/tengine_model/classification/classification_fp16.cpp @@ -0,0 +1,444 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: jingyou@openailab.com + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" +#include "tengine_c_api.h" +#include "common.hpp" +#include "cpu_device.h" +#include "compiler_fp16.h" + +#define DEFAULT_MODEL_NAME "squeezenet" +#define DEFAULT_IMAGE_FILE "tests/images/cat.jpg" +#define DEFAULT_LABEL_FILE "models/synset_words.txt" +#define DEFAULT_IMG_H 227 +#define DEFAULT_IMG_W 227 +#define DEFAULT_SCALE 1.f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_REPEAT_CNT 1 +#define PRINT_TOP_NUM 5 + +typedef struct +{ + const char* model_name; + int img_h; + int img_w; + float scale; + float mean[3]; + const char* tm_file; + const char* label_file; +} Model_Config; + +const Model_Config model_list[] = { + {"squeezenet", 227, 227, 1.f, {104.007, 116.669, 122.679}, "squeezenet_fp16.tmfile", "synset_words.txt"}, + {"mobilenet", 224, 224, 0.017, {104.007, 116.669, 122.679}, "mobilenet_fp16.tmfile", "synset_words.txt"}, + {"mobilenet_v2", 224, 224, 0.017, {104.007, 116.669, 122.679}, "mobilenet_v2_fp16.tmfile", "synset_words.txt"}, + {"resnet50", 224, 
224, 1.f, {104.007, 116.669, 122.679}, "resnet50_fp16.tmfile", "synset_words.txt"}, + {"alexnet", 227, 227, 1.f, {104.007, 116.669, 122.679}, "alexnet_fp16.tmfile", "synset_words.txt"}, + {"googlenet", 224, 224, 1.f, {104.007, 116.669, 122.679}, "googlenet_fp16.tmfile", "synset_words.txt"}, + {"inception_v3", 395, 395, 0.0078, {104.007, 116.669, 122.679}, "inception_v3_fp16.tmfile", "synset2015.txt"}, + {"inception_v4", 299, 299, 1 / 127.5f, {104.007, 116.669, 122.679}, "inception_v4_fp16.tmfile", "synset_words.txt"}, + {"vgg16", 224, 224, 1.f, {104.007, 116.669, 122.679}, "vgg16_fp16.tmfile", "synset_words.txt"} +}; + +const Model_Config* get_model_config(const char* model_name) +{ + std::string name1 = model_name; + for(unsigned int i = 0; i < name1.size(); i++) + name1[i] = tolower(name1[i]); + + for(unsigned int i = 0; i < sizeof(model_list) / sizeof(Model_Config); i++) + { + std::string name2 = model_list[i].model_name; + if(name1 == name2) + { + return &model_list[i]; + } + } + std::cerr << "Not support model name : " << model_name << "\n"; + return nullptr; +} + +void LoadLabelFile(std::vector& result, const char* fname) +{ + std::ifstream labels(fname); + + std::string line; + while(std::getline(labels, line)) + result.push_back(line); +} + +static inline bool PairCompare(const std::pair<__fp16, int>& lhs, const std::pair<__fp16, int>& rhs) +{ + return fp16_to_fp32(lhs.first) > fp16_to_fp32(rhs.first); +} + +static inline std::vector Argmax(const std::vector<__fp16>& v, int N) +{ + std::vector > pairs; + for(size_t i = 0; i < v.size(); ++i) + pairs.push_back(std::make_pair(v[i], i)); + std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare); + + std::vector result; + for(int i = 0; i < N; ++i) + result.push_back(pairs[i].second); + return result; +} + +void get_input_data(const char* image_file, __fp16* input_data, int img_h, int img_w, const float* mean, float scale) +{ + cv::Mat sample = cv::imread(image_file, -1); + 
if(sample.empty()) + { + std::cerr << "Failed to read image file " << image_file << ".\n"; + return; + } + cv::Mat img; + if(sample.channels() == 4) + { + cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR); + } + else if(sample.channels() == 1) + { + cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR); + } + else + { + img = sample; + } + + cv::resize(img, img, cv::Size(img_h, img_w)); + img.convertTo(img, CV_32FC3); + float* img_data = ( float* )img.data; + int hw = img_h * img_w; + + for(int h = 0; h < img_h; h++) + { + for(int w = 0; w < img_w; w++) + { + for(int c = 0; c < 3; c++) + { + input_data[c * hw + h * img_w + w] = fp32_to_fp16((*img_data - mean[c]) * scale); + img_data++; + } + } + } +} + +void PrintTopLabels(const char* label_file, __fp16* data) +{ + // load labels + std::vector labels; + LoadLabelFile(labels, label_file); + + __fp16* end = data + 1000; + std::vector<__fp16> result(data, end); + std::vector top_N = Argmax(result, PRINT_TOP_NUM); + + for(unsigned int i = 0; i < top_N.size(); i++) + { + int idx = top_N[i]; + + std::cout << std::fixed << std::setprecision(4) << fp16_to_fp32(result[idx]) << " - \"" << labels[idx] << "\"\n"; + } +} + +bool run_tengine_library(const char* model_name, const char* tm_file, const char* label_file, const char* image_file, + int img_h, int img_w, const float* mean, float scale, int repeat_count) +{ + // init + init_tengine(); + if(request_tengine_version("1.2") < 0) + return false; + + // create graph + graph_t graph = create_graph(nullptr, "tengine", tm_file); + if(graph == nullptr) + { + std::cerr << "Create graph failed.\n"; + std::cerr << "errno: " << get_tengine_errno() << "\n"; + return false; + } + + // set input shape + int img_size = img_h * img_w * 3; + int dims[] = {1, 3, img_h, img_w}; + __fp16* input_data = ( __fp16* )malloc(img_size * sizeof(__fp16)); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + if(input_tensor == nullptr) + { + std::cerr << "Get input tensor failed\n"; + return false; + 
} + set_tensor_shape(input_tensor, dims, 4); + + // prerun + if(prerun_graph(graph) < 0) + { + std::cerr << "Prerun graph failed\n"; + return false; + } + //dump_graph(graph); + + struct timeval t0, t1; + float avg_time = 0.f; + for(int i = 0; i < repeat_count; i++) + { + get_input_data(image_file, input_data, img_h, img_w, mean, scale); + set_tensor_buffer(input_tensor, input_data, img_size * 4); + + gettimeofday(&t0, NULL); + if(run_graph(graph, 1) < 0) + { + std::cerr << "Run graph failed\n"; + return false; + } + gettimeofday(&t1, NULL); + + float mytime = ( float )((t1.tv_sec * 1000000 + t1.tv_usec) - (t0.tv_sec * 1000000 + t0.tv_usec)) / 1000; + avg_time += mytime; + } + std::cout << "\nModel name : " << model_name << "\n" + << "tengine model file : " << tm_file << "\n" + << "label file : " << label_file << "\n" + << "image file : " << image_file << "\n" + << "img_h, imag_w, scale, mean[3] : " << img_h << " " << img_w << " " << scale << " " << mean[0] << " " + << mean[1] << " " << mean[2] << "\n"; + std::cout << "\nRepeat " << repeat_count << " times, avg time per run is " << avg_time / repeat_count << " ms\n"; + std::cout << "--------------------------------------\n"; + + // print output + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); + __fp16* data = ( __fp16* )get_tensor_buffer(output_tensor); + PrintTopLabels(label_file, data); + std::cout << "--------------------------------------\n"; + + //tensor_t tensor1 = get_graph_tensor(graph, "pool1"); + //Dumpdata("sqz_pool1_fp16.txt", (__fp16*)get_tensor_buffer(tensor1), get_tensor_buffer_size(tensor1)/2); + + free(input_data); + release_graph_tensor(input_tensor); + release_graph_tensor(output_tensor); + postrun_graph(graph); + destroy_graph(graph); + + return true; +} + +template static std::vector ParseString(const std::string str) +{ + typedef std::string::size_type pos; + const char delim_ch = ','; + std::string str_tmp = str; + std::vector result; + T t; + + pos delim_pos = 
str_tmp.find(delim_ch); + while(delim_pos != std::string::npos) + { + std::istringstream ist(str_tmp.substr(0, delim_pos)); + ist >> t; + result.push_back(t); + str_tmp.replace(0, delim_pos + 1, ""); + delim_pos = str_tmp.find(delim_ch); + } + if(str_tmp.size() > 0) + { + std::istringstream ist(str_tmp); + ist >> t; + result.push_back(t); + } + + return result; +} + +int main(int argc, char* argv[]) +{ + int repeat_count = DEFAULT_REPEAT_CNT; + const std::string root_path = get_root_path(); + std::string model_name; + std::string tm_file; + std::string label_file; + std::string image_file; + std::vector hw; + std::vector ms; + int img_h = 0; + int img_w = 0; + float scale = 0.0; + float mean[3] = {-1.0, -1.0, -1.0}; + + int res; + while((res = getopt(argc, argv, "n:t:l:i:g:s:w:r:h")) != -1) + { + switch(res) + { + case 'n': + model_name = optarg; + break; + case 't': + tm_file = optarg; + break; + case 'l': + label_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + hw = ParseString(optarg); + if(hw.size() != 2) + { + std::cerr << "Error -g parameter.\n"; + return -1; + } + img_h = hw[0]; + img_w = hw[1]; + break; + case 's': + scale = strtof(optarg, NULL); + break; + case 'w': + ms = ParseString(optarg); + if(ms.size() != 3) + { + std::cerr << "Error -w parameter.\n"; + return -1; + } + mean[0] = ms[0]; + mean[1] = ms[1]; + mean[2] = ms[2]; + break; + case 'r': + repeat_count = std::strtoul(optarg, NULL, 10); + break; + case 'h': + std::cout << "[Usage]: " << argv[0] << " [-h]\n" + << " [-n model_name] [-t tm_file] [-l label_file] [-i image_file]\n" + << " [-g img_h,img_w] [-s scale] [-w mean[0],mean[1],mean[2]] [-r repeat_count]\n"; + return 0; + default: + break; + } + } + + const Model_Config* mod_config; + // if model files not specified + if(tm_file.empty()) + { + // if model name not specified + if(model_name.empty()) + { + // use default model + model_name = DEFAULT_MODEL_NAME; + std::cout << "Model name and tm file not 
specified, run " << model_name << " by default.\n"; + } + // get model config in predefined model list + mod_config = get_model_config(model_name.c_str()); + if(mod_config == nullptr) + return -1; + + // get tm file + tm_file = get_file(mod_config->tm_file); + if(tm_file.empty()) + return -1; + + // if label file not specified + if(label_file.empty()) + { + // get label file + label_file = get_file(mod_config->label_file); + if(label_file.empty()) + return -1; + } + + if(!hw.size()) + { + img_h = mod_config->img_h; + img_w = mod_config->img_w; + } + if(scale == 0.0) + scale = mod_config->scale; + if(!ms.size()) + { + mean[0] = mod_config->mean[0]; + mean[1] = mod_config->mean[1]; + mean[2] = mod_config->mean[2]; + } + } + + // if label file not specified, use default label file + if(label_file.empty()) + { + label_file = root_path + DEFAULT_LABEL_FILE; + std::cout << "Label file not specified, use " << label_file << " by default.\n"; + } + + // if image file not specified, use default image file + if(image_file.empty()) + { + image_file = root_path + DEFAULT_IMAGE_FILE; + std::cout << "Image file not specified, use " << image_file << " by default.\n"; + } + + if(img_h == 0) + img_h = DEFAULT_IMG_H; + if(img_w == 0) + img_w = DEFAULT_IMG_W; + if(scale == 0.0) + scale = DEFAULT_SCALE; + if(mean[0] == -1.0) + mean[0] = DEFAULT_MEAN1; + if(mean[1] == -1.0) + mean[1] = DEFAULT_MEAN2; + if(mean[2] == -1.0) + mean[2] = DEFAULT_MEAN3; + if(model_name.empty()) + model_name = "unknown"; + + // check input files + if(!check_file_exist(tm_file) || !check_file_exist(label_file) || !check_file_exist(image_file)) + return -1; + + // start to run + if(!run_tengine_library(model_name.c_str(), tm_file.c_str(), label_file.c_str(), image_file.c_str(), img_h, img_w, + mean, scale, repeat_count)) + return -1; + + std::cout << "ALL TEST DONE\n"; + + return 0; +} diff --git a/examples/tengine_model/classification/classification_int8.cpp 
b/examples/tengine_model/classification/classification_int8.cpp new file mode 100644 index 000000000..1a2a2087d --- /dev/null +++ b/examples/tengine_model/classification/classification_int8.cpp @@ -0,0 +1,474 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: jingyou@openailab.com + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" +#include "tengine_c_api.h" +#include "common.hpp" +#include "cpu_device.h" + +#define DEFAULT_MODEL_NAME "squeezenet" +#define DEFAULT_IMAGE_FILE "tests/images/cat.jpg" +#define DEFAULT_LABEL_FILE "models/synset_words.txt" +#define DEFAULT_IMG_H 227 +#define DEFAULT_IMG_W 227 +#define DEFAULT_SCALE 1.f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_REPEAT_CNT 1 +#define PRINT_TOP_NUM 5 + +typedef struct +{ + const char* model_name; + int img_h; + int img_w; + float scale; + float mean[3]; + const char* tm_file; + const char* label_file; +} Model_Config; + +const Model_Config model_list[] = { + {"squeezenet", 227, 227, 1.f, {104.007, 116.669, 122.679}, "squeezenet_int8.tmfile", "synset_words.txt"}, + {"mobilenet", 224, 224, 0.017, {104.007, 116.669, 122.679}, "mobilenet_int8.tmfile", "synset_words.txt"}, + {"mobilenet_v2", 224, 224, 0.017, {104.007, 116.669, 122.679}, "mobilenet_v2_int8.tmfile", "synset_words.txt"}, + {"resnet50", 224, 224, 1.f, {104.007, 116.669, 122.679}, "resnet50_int8.tmfile", "synset_words.txt"}, + {"alexnet", 227, 227, 1.f, {104.007, 116.669, 122.679}, "alexnet_int8.tmfile", "synset_words.txt"}, + {"googlenet", 224, 224, 1.f, {104.007, 116.669, 122.679}, "googlenet_int8.tmfile", "synset_words.txt"}, + {"inception_v3", 395, 395, 0.0078, {104.007, 116.669, 122.679}, "inception_v3_int8.tmfile", "synset2015.txt"}, + {"inception_v4", 299, 299, 1 / 127.5f, {104.007, 116.669, 122.679}, "inception_v4_int8.tmfile", "synset_words.txt"}, + {"vgg16", 224, 224, 1.f, {104.007, 116.669, 122.679}, "vgg16_int8.tmfile", "synset_words.txt"} +}; + + +const Model_Config* get_model_config(const char* model_name) +{ + std::string name1 = model_name; 
+ for(unsigned int i = 0; i < name1.size(); i++) + name1[i] = tolower(name1[i]); + + for(unsigned int i = 0; i < sizeof(model_list) / sizeof(Model_Config); i++) + { + std::string name2 = model_list[i].model_name; + if(name1 == name2) + { + return &model_list[i]; + } + } + std::cerr << "Not support model name : " << model_name << "\n"; + return nullptr; +} + +void LoadLabelFile(std::vector& result, const char* fname) +{ + std::ifstream labels(fname); + + std::string line; + while(std::getline(labels, line)) + result.push_back(line); +} + +static inline bool PairCompare(const std::pair& lhs, const std::pair& rhs) +{ + return lhs.first > rhs.first; +} + +static inline std::vector Argmax(const std::vector& v, int N) +{ + std::vector> pairs; + for(size_t i = 0; i < v.size(); ++i) + pairs.push_back(std::make_pair(v[i], i)); + std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare); + + std::vector result; + for(int i = 0; i < N; ++i) + result.push_back(pairs[i].second); + return result; +} + +static float get_absmax_val(float* data, int data_size) +{ + float max_val = 0.f; + if(data != nullptr) + { + for(int i = 0; i < data_size; i++) + { + float abs_val = fabs(data[i]); + if(abs_val > max_val) + max_val = abs_val; + } + } + return max_val; +} + +void get_input_data(const char* image_file, int8_t* input_data, int img_h, int img_w, const float* mean, float scale, + float *input_scale, int *zero_point) +{ + cv::Mat sample = cv::imread(image_file, -1); + if(sample.empty()) + { + std::cerr << "Failed to read image file " << image_file << ".\n"; + return; + } + cv::Mat img; + if(sample.channels() == 4) + { + cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR); + } + else if(sample.channels() == 1) + { + cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR); + } + else + { + img = sample; + } + + cv::resize(img, img, cv::Size(img_h, img_w)); + img.convertTo(img, CV_32FC3); + float* img_data = ( float* )img.data; + int hw = img_h * img_w; + + float* temp_data = 
(float*)malloc(hw*3*sizeof(float)); + for(int h = 0; h < img_h; h++) + { + for(int w = 0; w < img_w; w++) + { + for(int c = 0; c < 3; c++) + { + temp_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale; + img_data++; + } + } + } + + float input_max = get_absmax_val(temp_data, hw*3); + *input_scale = input_max / 127; + *zero_point = 0; + + for(int i = 0; i < hw*3; i++) + input_data[i] = (int8_t)(round(temp_data[i] / *input_scale) + *zero_point); + + free(temp_data); +} + +void PrintTopLabels(const char* label_file, int8_t* data, float q_scale) +{ + // load labels + std::vector labels; + LoadLabelFile(labels, label_file); + + int8_t* end = data + 1000; + std::vector result(data, end); + std::vector top_N = Argmax(result, PRINT_TOP_NUM); + + for(unsigned int i = 0; i < top_N.size(); i++) + { + int idx = top_N[i]; + + float val = result[idx] * q_scale; + std::cout << std::fixed << std::setprecision(4) << val << " - \"" << labels[idx] << "\"\n"; + } +} + +bool run_tengine_library(const char* model_name, const char* tm_file, const char* label_file, const char* image_file, + int img_h, int img_w, const float* mean, float scale, int repeat_count) +{ + // init + init_tengine(); + if(request_tengine_version("1.2") < 0) + return false; + + // create graph + graph_t graph = create_graph(nullptr, "tengine", tm_file); + if(graph == nullptr) + { + std::cerr << "Create graph failed.\n"; + std::cerr << "errno: " << get_tengine_errno() << "\n"; + return false; + } + + // set input shape + int img_size = img_h * img_w * 3; + int dims[] = {1, 3, img_h, img_w}; + int8_t* input_data = ( int8_t* )malloc(img_size); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + if(input_tensor == nullptr) + { + std::cerr << "Get input tensor failed\n"; + return false; + } + set_tensor_shape(input_tensor, dims, 4); + + // prerun + if(prerun_graph(graph) < 0) + { + std::cerr << "Prerun graph failed\n"; + return false; + } + //dump_graph(graph); + + struct timeval t0, t1; + float 
avg_time = 0.f; + for(int i = 0; i < repeat_count; i++) + { + float input_scale; + int zero_point; + get_input_data(image_file, input_data, img_h, img_w, mean, scale, &input_scale, &zero_point); + set_tensor_buffer(input_tensor, input_data, img_size * 4); + set_tensor_quant_param(input_tensor, &input_scale, &zero_point, 1); + + gettimeofday(&t0, NULL); + if(run_graph(graph, 1) < 0) + { + std::cerr << "Run graph failed\n"; + return false; + } + gettimeofday(&t1, NULL); + + float mytime = ( float )((t1.tv_sec * 1000000 + t1.tv_usec) - (t0.tv_sec * 1000000 + t0.tv_usec)) / 1000; + avg_time += mytime; + } + std::cout << "\nModel name : " << model_name << "\n" + << "tengine model file : " << tm_file << "\n" + << "label file : " << label_file << "\n" + << "image file : " << image_file << "\n" + << "img_h, imag_w, scale, mean[3] : " << img_h << " " << img_w << " " << scale << " " << mean[0] << " " + << mean[1] << " " << mean[2] << "\n"; + std::cout << "\nRepeat " << repeat_count << " times, avg time per run is " << avg_time / repeat_count << " ms\n"; + std::cout << "--------------------------------------\n"; + + // print output + float q_scale; + int q_zero; + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); + get_tensor_quant_param(output_tensor, &q_scale, &q_zero, 1); + int8_t* data = ( int8_t* )get_tensor_buffer(output_tensor); + PrintTopLabels(label_file, data, q_scale); + std::cout << "--------------------------------------\n"; + + free(input_data); + release_graph_tensor(input_tensor); + release_graph_tensor(output_tensor); + postrun_graph(graph); + destroy_graph(graph); + + return true; +} + +template static std::vector ParseString(const std::string str) +{ + typedef std::string::size_type pos; + const char delim_ch = ','; + std::string str_tmp = str; + std::vector result; + T t; + + pos delim_pos = str_tmp.find(delim_ch); + while(delim_pos != std::string::npos) + { + std::istringstream ist(str_tmp.substr(0, delim_pos)); + ist >> t; + 
result.push_back(t); + str_tmp.replace(0, delim_pos + 1, ""); + delim_pos = str_tmp.find(delim_ch); + } + if(str_tmp.size() > 0) + { + std::istringstream ist(str_tmp); + ist >> t; + result.push_back(t); + } + + return result; +} + +int main(int argc, char* argv[]) +{ + int repeat_count = DEFAULT_REPEAT_CNT; + const std::string root_path = get_root_path(); + std::string model_name; + std::string tm_file; + std::string label_file; + std::string image_file; + std::vector hw; + std::vector ms; + int img_h = 0; + int img_w = 0; + float scale = 0.0; + float mean[3] = {-1.0, -1.0, -1.0}; + + int res; + while((res = getopt(argc, argv, "n:t:l:i:g:s:w:r:h")) != -1) + { + switch(res) + { + case 'n': + model_name = optarg; + break; + case 't': + tm_file = optarg; + break; + case 'l': + label_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + hw = ParseString(optarg); + if(hw.size() != 2) + { + std::cerr << "Error -g parameter.\n"; + return -1; + } + img_h = hw[0]; + img_w = hw[1]; + break; + case 's': + scale = strtof(optarg, NULL); + break; + case 'w': + ms = ParseString(optarg); + if(ms.size() != 3) + { + std::cerr << "Error -w parameter.\n"; + return -1; + } + mean[0] = ms[0]; + mean[1] = ms[1]; + mean[2] = ms[2]; + break; + case 'r': + repeat_count = std::strtoul(optarg, NULL, 10); + break; + case 'h': + std::cout << "[Usage]: " << argv[0] << " [-h]\n" + << " [-n model_name] [-t tm_file] [-l label_file] [-i image_file]\n" + << " [-g img_h,img_w] [-s scale] [-w mean[0],mean[1],mean[2]] [-r repeat_count]\n"; + return 0; + default: + break; + } + } + + const Model_Config* mod_config; + // if model files not specified + if(tm_file.empty()) + { + // if model name not specified + if(model_name.empty()) + { + // use default model + model_name = DEFAULT_MODEL_NAME; + std::cout << "Model name and tm file not specified, run " << model_name << " by default.\n"; + } + // get model config in predefined model list + mod_config = 
get_model_config(model_name.c_str()); + if(mod_config == nullptr) + return -1; + + // get tm file + tm_file = get_file(mod_config->tm_file); + if(tm_file.empty()) + return -1; + + // if label file not specified + if(label_file.empty()) + { + // get label file + label_file = get_file(mod_config->label_file); + if(label_file.empty()) + return -1; + } + + if(!hw.size()) + { + img_h = mod_config->img_h; + img_w = mod_config->img_w; + } + if(scale == 0.0) + scale = mod_config->scale; + if(!ms.size()) + { + mean[0] = mod_config->mean[0]; + mean[1] = mod_config->mean[1]; + mean[2] = mod_config->mean[2]; + } + } + + // if label file not specified, use default label file + if(label_file.empty()) + { + label_file = root_path + DEFAULT_LABEL_FILE; + std::cout << "Label file not specified, use " << label_file << " by default.\n"; + } + + // if image file not specified, use default image file + if(image_file.empty()) + { + image_file = root_path + DEFAULT_IMAGE_FILE; + std::cout << "Image file not specified, use " << image_file << " by default.\n"; + } + + if(img_h == 0) + img_h = DEFAULT_IMG_H; + if(img_w == 0) + img_w = DEFAULT_IMG_W; + if(scale == 0.0) + scale = DEFAULT_SCALE; + if(mean[0] == -1.0) + mean[0] = DEFAULT_MEAN1; + if(mean[1] == -1.0) + mean[1] = DEFAULT_MEAN2; + if(mean[2] == -1.0) + mean[2] = DEFAULT_MEAN3; + if(model_name.empty()) + model_name = "unknown"; + + // check input files + if(!check_file_exist(tm_file) || !check_file_exist(label_file) || !check_file_exist(image_file)) + return -1; + + // start to run + if(!run_tengine_library(model_name.c_str(), tm_file.c_str(), label_file.c_str(), image_file.c_str(), img_h, img_w, + mean, scale, repeat_count)) + return -1; + + std::cout << "ALL TEST DONE\n"; + + return 0; +} diff --git a/examples/tengine_model/classification/classification_tf.cpp b/examples/tengine_model/classification/classification_tf.cpp new file mode 100644 index 000000000..bd85e2b08 --- /dev/null +++ 
b/examples/tengine_model/classification/classification_tf.cpp @@ -0,0 +1,424 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: jingyou@openailab.com + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" +#include "tengine_c_api.h" +#include "common.hpp" +#include "cpu_device.h" + +#define DEFAULT_MODEL_NAME "inception_v3" +#define DEFAULT_IMAGE_FILE "tests/images/cat.jpg" +#define DEFAULT_LABEL_FILE "models/synset_words.txt" +#define DEFAULT_IMG_H 299 +#define DEFAULT_IMG_W 299 +#define DEFAULT_SCALE 1.f +#define DEFAULT_MEAN1 0 +#define DEFAULT_MEAN2 0 +#define DEFAULT_MEAN3 0 +#define DEFAULT_REPEAT_CNT 1 +#define PRINT_TOP_NUM 5 + +typedef struct +{ + const char* model_name; + int img_h; + int img_w; + float scale; + float mean[3]; + const char* tm_file; + const char* label_file; +} Model_Config; + +const Model_Config model_list[] = { + {"inception_v3", 299, 299, 0.0039, {0, 0, 0}, "inception_v3_tf.tmfile", "labels.txt"}, + {"inception_v4", 299, 299, 0.0039, {0, 0, 0}, "inception_v4_tf.tmfile", "labels.txt"}, + {"resnet_v2", 299, 299, 0.0039, 
{0, 0, 0}, "resnet_v2_tf.tmfile", "labels.txt"}, + {"mobilenet_v1", 224, 224, 0.017, {104.007, 116.669, 122.679}, "mobilenet_v1_tf.tmfile", "labels.txt"}, + {"mobilenet_v2", 224, 224, 0.0078, {128, 128, 128}, "mobilenet_v2_tf.tmfile", "imagenet_slim_labels.txt"}, + {"squeezenet", 224, 224, 0.0039, {0, 0, 0}, "squeezenet_tf.tmfile", "labels.txt"}, + {"resnet50", 224, 224, 1.f, {0, 0, 0}, "resnet50_tf.tmfile", "synset_words.txt"}}; + +const Model_Config* get_model_config(const char* model_name) +{ + std::string name1 = model_name; + for(unsigned int i = 0; i < name1.size(); i++) + name1[i] = tolower(name1[i]); + + for(unsigned int i = 0; i < sizeof(model_list) / sizeof(Model_Config); i++) + { + std::string name2 = model_list[i].model_name; + if(name1 == name2) + { + return &model_list[i]; + } + } + std::cerr << "Not support model name : " << model_name << "\n"; + return nullptr; +} + +void LoadLabelFile(std::vector& result, const char* fname) +{ + std::ifstream labels(fname); + + std::string line; + while(std::getline(labels, line)) + result.push_back(line); +} + +static inline bool PairCompare(const std::pair& lhs, const std::pair& rhs) +{ + return lhs.first > rhs.first; +} + +static inline std::vector Argmax(const std::vector& v, int N) +{ + std::vector> pairs; + for(size_t i = 0; i < v.size(); ++i) + pairs.push_back(std::make_pair(v[i], i)); + std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare); + + std::vector result; + for(int i = 0; i < N; ++i) + result.push_back(pairs[i].second); + return result; +} + +void get_input_data(const char* image_file, float* input_data, const int img_h, const int img_w, const float mean, + const float scale) +{ + cv::Mat img = cv::imread(image_file, -1); + if(img.empty()) + { + std::cerr << "Failed to read image file " << image_file << ".\n"; + return; + } + + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + cv::resize(img, img, cv::Size(img_w, img_h)); + img.convertTo(img, CV_32FC3); + + img = (img - mean) * 
scale; + + std::vector input_channels; + float* ptr = input_data; + + for(int i = 0; i < 3; ++i) + { + cv::Mat channel(img_h, img_w, CV_32FC1, ptr); + input_channels.push_back(channel); + ptr += img_h * img_w; + } + + cv::split(img, input_channels); +} + +void PrintTopLabels(const char* label_file, float* data) +{ + // load labels + std::vector labels; + LoadLabelFile(labels, label_file); + + float* end = data + 1000; + std::vector result(data, end); + std::vector top_N = Argmax(result, PRINT_TOP_NUM); + + for(unsigned int i = 0; i < top_N.size(); i++) + { + int idx = top_N[i]; + + std::cout << std::fixed << std::setprecision(4) << result[idx] << " - \"" << labels[idx] << "\"\n"; + } +} + +bool run_tengine_library(const char* model_name, const char* tm_file, const char* label_file, const char* image_file, + int img_h, int img_w, const float* mean, float scale, int repeat_count) +{ + // init + init_tengine(); + if(request_tengine_version("0.9") < 0) + return false; + + // create graph + graph_t graph = create_graph(nullptr, "tengine", tm_file); + if(graph == nullptr) + { + std::cerr << "Create graph failed.\n"; + std::cerr << "errno: " << get_tengine_errno() << "\n"; + return false; + } + + // set input shape + int img_size = img_h * img_w * 3; + int dims[] = {1, 3, img_h, img_w}; + float* input_data = ( float* )malloc(sizeof(float) * img_size); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + if(input_tensor == nullptr) + { + std::cerr << "Get input tensor failed\n"; + return false; + } + set_tensor_shape(input_tensor, dims, 4); + + // prerun + if(prerun_graph(graph) < 0) + { + std::cerr << "Prerun graph failed\n"; + return false; + } + + struct timeval t0, t1; + float avg_time = 0.f; + for(int i = 0; i < repeat_count; i++) + { + get_input_data(image_file, input_data, img_h, img_w, mean[0], scale); + set_tensor_buffer(input_tensor, input_data, img_size * 4); + + gettimeofday(&t0, NULL); + if(run_graph(graph, 1) < 0) + { + std::cerr << "Run graph 
failed\n"; + return false; + } + gettimeofday(&t1, NULL); + + float mytime = ( float )((t1.tv_sec * 1000000 + t1.tv_usec) - (t0.tv_sec * 1000000 + t0.tv_usec)) / 1000; + avg_time += mytime; + } + std::cout << "\nModel name : " << model_name << "\n" + << "tengine model file : " << tm_file << "\n" + << "label file : " << label_file << "\n" + << "image file : " << image_file << "\n" + << "img_h, imag_w, scale, mean[3] : " << img_h << " " << img_w << " " << scale << " " << mean[0] << " " + << mean[1] << " " << mean[2] << "\n"; + std::cout << "\nRepeat " << repeat_count << " times, avg time per run is " << avg_time / repeat_count << " ms\n"; + std::cout << "--------------------------------------\n"; + + // print output + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); + float* data = ( float* )get_tensor_buffer(output_tensor); + PrintTopLabels(label_file, data); + std::cout << "--------------------------------------\n"; + + free(input_data); + release_graph_tensor(input_tensor); + release_graph_tensor(output_tensor); + postrun_graph(graph); + destroy_graph(graph); + + return true; +} + +template static std::vector ParseString(const std::string str) +{ + typedef std::string::size_type pos; + const char delim_ch = ','; + std::string str_tmp = str; + std::vector result; + T t; + + pos delim_pos = str_tmp.find(delim_ch); + while(delim_pos != std::string::npos) + { + std::istringstream ist(str_tmp.substr(0, delim_pos)); + ist >> t; + result.push_back(t); + str_tmp.replace(0, delim_pos + 1, ""); + delim_pos = str_tmp.find(delim_ch); + } + if(str_tmp.size() > 0) + { + std::istringstream ist(str_tmp); + ist >> t; + result.push_back(t); + } + + return result; +} + +int main(int argc, char* argv[]) +{ + int repeat_count = DEFAULT_REPEAT_CNT; + const std::string root_path = get_root_path(); + std::string model_name; + std::string tm_file; + std::string label_file; + std::string image_file; + std::vector hw; + std::vector ms; + int img_h = 0; + int img_w = 0; + float 
scale = 0.0; + float mean[3] = {-1.0, -1.0, -1.0}; + + int res; + while((res = getopt(argc, argv, "n:t:l:i:g:s:w:r:h")) != -1) + { + switch(res) + { + case 'n': + model_name = optarg; + break; + case 't': + tm_file = optarg; + break; + case 'l': + label_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + hw = ParseString(optarg); + if(hw.size() != 2) + { + std::cerr << "Error -g parameter.\n"; + return -1; + } + img_h = hw[0]; + img_w = hw[1]; + break; + case 's': + scale = strtof(optarg, NULL); + break; + case 'w': + ms = ParseString(optarg); + if(ms.size() != 3) + { + std::cerr << "Error -w parameter.\n"; + return -1; + } + mean[0] = ms[0]; + mean[1] = ms[1]; + mean[2] = ms[2]; + break; + case 'r': + repeat_count = std::strtoul(optarg, NULL, 10); + break; + case 'h': + std::cout << "[Usage]: " << argv[0] << " [-h]\n" + << " [-n model_name] [-t tm_file] [-l label_file] [-i image_file]\n" + << " [-g img_h,img_w] [-s scale] [-w mean[0],mean[1],mean[2]] [-r repeat_count]\n"; + return 0; + default: + break; + } + } + + const Model_Config* mod_config; + // if model files not specified + if(tm_file.empty()) + { + // if model name not specified + if(model_name.empty()) + { + // use default model + model_name = DEFAULT_MODEL_NAME; + std::cout << "Model name and tm file not specified, run " << model_name << " by default.\n"; + } + // get model config in predefined model list + mod_config = get_model_config(model_name.c_str()); + if(mod_config == nullptr) + return -1; + + // get tm file + tm_file = get_file(mod_config->tm_file); + if(tm_file.empty()) + return -1; + + // if label file not specified + if(label_file.empty()) + { + // get label file + label_file = get_file(mod_config->label_file); + if(label_file.empty()) + return -1; + } + + if(!hw.size()) + { + img_h = mod_config->img_h; + img_w = mod_config->img_w; + } + if(scale == 0.0) + scale = mod_config->scale; + if(!ms.size()) + { + mean[0] = mod_config->mean[0]; + mean[1] = 
mod_config->mean[1]; + mean[2] = mod_config->mean[2]; + } + } + + // if label file not specified, use default label file + if(label_file.empty()) + { + label_file = root_path + DEFAULT_LABEL_FILE; + std::cout << "Label file not specified, use " << label_file << " by default.\n"; + } + + // if image file not specified, use default image file + if(image_file.empty()) + { + image_file = root_path + DEFAULT_IMAGE_FILE; + std::cout << "Image file not specified, use " << image_file << " by default.\n"; + } + + if(img_h == 0) + img_h = DEFAULT_IMG_H; + if(img_w == 0) + img_w = DEFAULT_IMG_W; + if(scale == 0.0) + scale = DEFAULT_SCALE; + if(mean[0] == -1.0) + mean[0] = DEFAULT_MEAN1; + if(mean[1] == -1.0) + mean[1] = DEFAULT_MEAN2; + if(mean[2] == -1.0) + mean[2] = DEFAULT_MEAN3; + if(model_name.empty()) + model_name = "unknown"; + + // check input files + if(!check_file_exist(tm_file) || !check_file_exist(label_file) || !check_file_exist(image_file)) + return -1; + + // start to run + if(!run_tengine_library(model_name.c_str(), tm_file.c_str(), label_file.c_str(), image_file.c_str(), img_h, img_w, + mean, scale, repeat_count)) + return -1; + + std::cout << "ALL TEST DONE\n"; + + return 0; +} diff --git a/examples/tengine_model/convert/CMakeLists.txt b/examples/tengine_model/convert/CMakeLists.txt index 5d24442d3..fd461c91c 100644 --- a/examples/tengine_model/convert/CMakeLists.txt +++ b/examples/tengine_model/convert/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(tm_convert) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) set( CODE_SRCS convert_caffe_to_tm.cpp ../../common/common.cpp) set( BIN_EXE convert_caffe_to_tm ) @@ -33,6 +33,10 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() + #exe add_executable(${BIN_EXE} ${CODE_SRCS}) @@ -40,6 +44,6 @@ add_executable(${BIN_EXE} ${CODE_SRCS}) if( ANDROID) target_link_libraries(${BIN_EXE} 
${TENGINE_LIBS} ${PROTOBUF_LIB} ${BLAS_LIB}) else() - target_link_libraries(${BIN_EXE} ${TENGINE_LIBS}) + target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/tengine_model/convert/convert_caffe_to_tm.cpp b/examples/tengine_model/convert/convert_caffe_to_tm.cpp index cc6c04205..7207dae3d 100644 --- a/examples/tengine_model/convert/convert_caffe_to_tm.cpp +++ b/examples/tengine_model/convert/convert_caffe_to_tm.cpp @@ -89,6 +89,24 @@ int main(int argc, char* argv[]) return -1; } + const char* env = std::getenv("TM_NO_OPTIMIZE"); + if(env == nullptr) + { + // optimize graph + int optimize_only = 1; + if(set_graph_attr(graph, "optimize_only", &optimize_only, sizeof(int)) < 0) + { + std::cerr<<"set optimize only failed\n"; + return -1; + } + + if(prerun_graph(graph) < 0) + { + std::cerr<<"prerun failed\n"; + return -1; + } + } + // save the tengine model file if(save_graph(graph, "tengine", output_tmfile.c_str()) < 0) { diff --git a/examples/tensorflow_wrapper/label_image/CMakeLists.txt b/examples/tensorflow_wrapper/label_image/CMakeLists.txt index 354849940..fb4443b64 100644 --- a/examples/tensorflow_wrapper/label_image/CMakeLists.txt +++ b/examples/tensorflow_wrapper/label_image/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(label_image) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS wrapper tengine boost_system boost_thread ) +set( TENGINE_LIBS wrapper tengine hclcpu boost_system boost_thread ) set( CODE_SRC label_image.cpp ../../common/common.cpp ) #opencv @@ -20,15 +20,19 @@ set(CMAKE_CXX_FLAGS "-std=c++11 -O3 -Wall") #lib link_directories(${INSTALL_DIR}/lib) +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() + #exe add_executable(label_image_inceptionv3 ${CODE_SRC}) -target_link_libraries(label_image_inceptionv3 ${TENGINE_LIBS} ${OpenCV_LIBS}) +target_link_libraries(label_image_inceptionv3 ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) add_executable(label_image_mobilenet 
${CODE_SRC}) target_compile_definitions(label_image_mobilenet PUBLIC -DMOBILE_NET) -target_link_libraries(label_image_mobilenet ${TENGINE_LIBS} ${OpenCV_LIBS}) +target_link_libraries(label_image_mobilenet ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) add_executable(label_image_resnet50 ${CODE_SRC}) target_compile_definitions(label_image_resnet50 PUBLIC -DRESNET50) -target_link_libraries(label_image_resnet50 ${TENGINE_LIBS} ${OpenCV_LIBS}) +target_link_libraries(label_image_resnet50 ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) diff --git a/examples/yolov2/CMakeLists.txt b/examples/yolov2/CMakeLists.txt index f74ec5d7a..66fd6e73b 100644 --- a/examples/yolov2/CMakeLists.txt +++ b/examples/yolov2/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(YOLOV2) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) @@ -20,6 +20,11 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() + + set( CODE_SRCS yolov2.cpp ../common/common.cpp) set( BIN_EXE YOLOV2) @@ -46,7 +51,7 @@ add_executable(${BIN_EXE} ${CODE_SRCS}) if( ANDROID) target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${PROTOBUF_LIB} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/yolov2/yolov2.cpp b/examples/yolov2/yolov2.cpp index ba29f571e..e5f1e94f7 100644 --- a/examples/yolov2/yolov2.cpp +++ b/examples/yolov2/yolov2.cpp @@ -479,7 +479,7 @@ int main(int argc, char** argv) std::vector param_biases; - if(get_node_attr_generic(node, "biases", &typeid(std::vector), ¶m_biases, sizeof(param_biases)) < 0) + if(get_node_attr_generic(node, "biases", typeid(std::vector).name(), ¶m_biases, sizeof(param_biases)) < 0) { std::cout << 
"cannot get bias settings\n"; return 1; diff --git a/executor/Makefile b/executor/Makefile index 8a64f79a3..0bd90ee7f 100644 --- a/executor/Makefile +++ b/executor/Makefile @@ -1,4 +1,3 @@ -obj-y+=operator/ obj-y+=lib/ obj-y+=engine/ obj-y+=plugin/ diff --git a/executor/include/kernel_registry.hpp b/executor/include/kernel_registry.hpp new file mode 100644 index 000000000..0c1e277b3 --- /dev/null +++ b/executor/include/kernel_registry.hpp @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#ifndef __KERNEL_REGISTRY_HPP__ +#define __KERNEL_REGISTRY_HPP__ + +#include + +namespace TEngine { + +template +class KernelRegistry { +public: + bool Register(const T& t, int layout, int data_type) + { + int idx=get_idx(layout,data_type); + + if(map.count(idx)) + return false; + + map[idx]=t; + return true; + } + + bool GetKernel(T& t, int layout, int data_type) + { + int idx=get_idx(layout,data_type); + + if(map.count(idx)==0) + return false; + + t=map[idx]; + + return true; + } + +private: + int get_idx(int layout, int data_type) { return (layout<<8)|(data_type);} + + + std::unordered_map map; + +}; + + + +} //namespace TEngine + +#endif diff --git a/executor/include/node_ops.hpp b/executor/include/node_ops.hpp index 0d2e34ab3..5404d20ce 100644 --- a/executor/include/node_ops.hpp +++ b/executor/include/node_ops.hpp @@ -37,6 +37,10 @@ #include "cpu_info.hpp" #include "exec_attr.hpp" +#ifdef __ANDROID__ +#define dynamic_cast static_cast +#endif + namespace TEngine { #define ATTR_NODE_OPS "node_ops" @@ -191,15 +195,26 @@ struct PrioSelector : public NodeOpsSelector auto ops = match_func(cpu_info, node); if(ops) + { + ops->need_free=true; return ops; + } } return nullptr; } - void Register(int priority, select_node_ops_t func) + bool Register(int priority, select_node_ops_t func) { + for(auto ir = prio_list.begin(); ir != prio_list.end(); ir++) + { + auto prio = ir->first; + + if(prio == priority) + return false; + } prio_list[priority] = func; + return true; } std::map prio_list; diff --git a/executor/lib/Makefile b/executor/lib/Makefile index 29a1390c9..dec8747b8 100644 --- a/executor/lib/Makefile +++ b/executor/lib/Makefile @@ -9,8 +9,8 @@ obj-y+=device_driver.o obj-y+=node_dev_executor.o obj-y+=node_dev_driver.o obj-y+=node_ops.o -obj-y+=tengine_test_api.o obj-y+=cpu_info.o obj-y+=custom_kernel.o +obj-y+=custom_kernel_ops.o COMMON_CFLAGS+=-I../../driver/cpu diff --git 
a/executor/lib/custom_kernel.cpp b/executor/lib/custom_kernel.cpp index 56115c571..b0e71ecd0 100644 --- a/executor/lib/custom_kernel.cpp +++ b/executor/lib/custom_kernel.cpp @@ -26,7 +26,7 @@ static void PrepareOneTensor(Node* node, Tensor* tensor, struct custom_kernel_te t->data_type = tensor->GetDataType(); t->element_num = shape.GetSize(); t->element_size = DataType::GetTypeSize(t->data_type); - t->layout_type = exec_attr->layout; + t->layout_type = exec_attr->graph_layout; t->data = nullptr; } diff --git a/executor/operator/common/custom_kernel_ops.cpp b/executor/lib/custom_kernel_ops.cpp similarity index 99% rename from executor/operator/common/custom_kernel_ops.cpp rename to executor/lib/custom_kernel_ops.cpp index a063ad56b..88598554a 100644 --- a/executor/operator/common/custom_kernel_ops.cpp +++ b/executor/lib/custom_kernel_ops.cpp @@ -170,8 +170,6 @@ NodeOps* CustomKernelNodeOps::NewOps(Node* node, struct custom_kernel_ops* k_ops { NodeOps* ops = new CustomKernelNodeOps(node, k_ops); - ops->need_free = 1; - return ops; } diff --git a/executor/lib/device_driver.cpp b/executor/lib/device_driver.cpp index 3cee38838..dc8555fbc 100644 --- a/executor/lib/device_driver.cpp +++ b/executor/lib/device_driver.cpp @@ -56,7 +56,7 @@ bool DriverManager::UnregisterDriver(Driver* driver) Driver* DriverManager::GetDriver(const std::string& name) { - Driver* driver; + Driver* driver=nullptr; if(!SafeGet(name, driver)) return nullptr; diff --git a/executor/lib/generic_dev_executor.cpp b/executor/lib/generic_dev_executor.cpp index ebbf16f2b..e96a65f08 100644 --- a/executor/lib/generic_dev_executor.cpp +++ b/executor/lib/generic_dev_executor.cpp @@ -233,6 +233,16 @@ bool GenericDevExecutor::PrerunTask(SubgraphTask* task) if(task->graph_handle == nullptr || !OptimizeGraph(task)) return false; + GraphTask * graph_task=task->graph_task; + GraphExecutor * executor=graph_task->GetGraphExecutor(); + + int optimize_only=0; + + 
executor->GetGraphAttr("optimize_only",&optimize_only,sizeof(int)); + + if(optimize_only) + return true; + unsigned int mem_size; if(DevGetMemorySize(task->graph_handle, mem_size)) diff --git a/executor/lib/graph_optimizer.cpp b/executor/lib/graph_optimizer.cpp index 59621fe08..7920a260b 100644 --- a/executor/lib/graph_optimizer.cpp +++ b/executor/lib/graph_optimizer.cpp @@ -48,29 +48,25 @@ static void AddConstNodeToSubGraph(Subgraph* graph, Tensor* tensor, Node* fused_ static bool Weight_Bn(Subgraph* graph, Node* ConvNode, float* mean, float* var, float* gamma, float* beta, float eps, float rescale_factor, Tensor* bias_tensor) { - Tensor* input_tensor = ConvNode->GetInputTensor(0); + Tensor* kernel_tensor = ConvNode->GetInputTensor(1); Convolution* conv_op = dynamic_cast(ConvNode->GetOp()); ConvParam* param = conv_op->GetParam(); - const TShape& input_shape = input_tensor->GetShape(); + const TShape& kernel_shape = kernel_tensor->GetShape(); int group = param->group; - int input_chan = input_shape.Shape(1) / group; - - Tensor* output_tensor = ConvNode->GetOutputTensor(0); - TShape& output_shape = output_tensor->GetShape(); + int input_chan = kernel_shape.Shape(1) ; - int output_chan = output_shape.GetC() / group; + int output_chan = kernel_shape.Shape(0) / group; int kernel_x = param->kernel_w; int kernel_y = param->kernel_h; int kernel_size = input_chan * kernel_x * kernel_y; - Tensor* kernel_tensor = ConvNode->GetInputTensor(1); float* kernel_org = ( float* )get_tensor_mem(kernel_tensor); - int channel_num = output_shape.GetC(); + int channel_num = kernel_shape.Shape(0); - float* kernel_new = ( float* )(malloc(kernel_size * channel_num * sizeof(float))); + float* kernel_new = ( float* )(malloc(kernel_size * channel_num * sizeof(float) + 128)); memcpy(kernel_new, kernel_org, sizeof(float) * kernel_size * channel_num); @@ -102,7 +98,7 @@ static bool Weight_Bn(Subgraph* graph, Node* ConvNode, float* mean, float* var, { Tensor* new_bias_tensor = new 
Tensor(bias_name); - std::vector dims{1, channel_num, 1, 1}; + std::vector dims{channel_num}; TShape bias_shape; bias_shape.SetDim(dims); @@ -110,7 +106,7 @@ static bool Weight_Bn(Subgraph* graph, Node* ConvNode, float* mean, float* var, new_bias_tensor->Reshape(bias_shape); new_bias_tensor->SetType(kConstTensor); - void* bias_new = ( void* )malloc(channel_num * sizeof(float)); + void* bias_new = ( void* )malloc(channel_num * sizeof(float) + 128); new_bias_tensor->SetMemAddr(bias_new); diff --git a/executor/lib/graph_task.cpp b/executor/lib/graph_task.cpp index 1b2bf836b..931a6e038 100644 --- a/executor/lib/graph_task.cpp +++ b/executor/lib/graph_task.cpp @@ -207,6 +207,11 @@ Graph* GraphTask::GetOptimizedGraph(void) optimized_graph_ = MergeSubgraph(graph_, sub_list); + optimized_graph_->SetLayout(graph_->GetLayout()); + optimized_graph_->SetModelLayout(graph_->GetModelLayout()); + optimized_graph_->SetModelFormat(graph_->GetModelFormat()); + optimized_graph_->SetModelSubFormat(graph_->GetModelSubFormat()); + return optimized_graph_; } diff --git a/executor/lib/node_ops.cpp b/executor/lib/node_ops.cpp index 19a110a5a..cdea37202 100644 --- a/executor/lib/node_ops.cpp +++ b/executor/lib/node_ops.cpp @@ -347,7 +347,8 @@ bool NodeOpsRegistryManager::RegisterOPImplementor(const std::string& registry_n registry->RegisterSelector(prio_selector); } - prio_selector->Register(priority, select_func); + if(!prio_selector->Register(priority, select_func)) + return false; return true; } diff --git a/executor/lib/tengine_test_api.cpp b/executor/lib/tengine_test_api.cpp deleted file mode 100644 index 4b34e96e7..000000000 --- a/executor/lib/tengine_test_api.cpp +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ -#include "data_type.hpp" -#include "exec_context.hpp" -#include "graph.hpp" -#include "tensor_mem.hpp" -#include "operator/convolution.hpp" - -#include "tengine_test_api.h" -#include "node_ops.hpp" -#include "cpu_driver.hpp" -#include "graph_executor.hpp" - -using namespace TEngine; - -test_node_t create_convolution_test_node(int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h0, int pad_h1, - int pad_w0, int pad_w1, int dilation_h, int dilation_w, int input_channel, - int output_channel, int group) -{ - /* create op */ - - Operator* op = OpManager::CreateOp("Convolution"); - Convolution* conv_op = dynamic_cast(op); - - ConvParam* param = conv_op->GetParam(); - - param->kernel_h = kernel_h; - param->kernel_w = kernel_w; - param->stride_h = stride_h; - param->stride_w = stride_w; - param->output_channel = output_channel; - param->group = group; - param->dilation_h = dilation_h; - param->dilation_w = dilation_w; - - param->pad_h = -1; - param->pad_w = -1; - - param->pads.resize(4); - param->pads[0] = pad_h0; - param->pads[1] = pad_w0; - param->pads[2] = pad_h1; - param->pads[3] = pad_w1; - - /* create node */ - - Node* node = new Node("test_convolution"); - - node->SetOp(conv_op); - - return node; -} - -static int test_conv_node_set_input(Node* node, float* input_data[], int* input_shape[], int 
input_number) -{ - // input - - Tensor* tensor = new Tensor("input"); - - tensor->SetDataType(DataType::GetTypeID("float32")); - tensor->SetType(kConstTensor); - tensor->SetMemAddr(input_data[0]); - - int* input_dim = input_shape[0]; - - std::vector input_dims = {input_dim[0], input_dim[1], input_dim[2], input_dim[3]}; - - TShape& intput_shape = tensor->GetShape(); - - intput_shape.SetDataLayout("NCHW"); - intput_shape.SetDim(input_dims); - - node->AddInputTensor(tensor); - - // weight - - tensor = new Tensor("weight"); - - tensor->SetDataType(DataType::GetTypeID("float32")); - tensor->SetType(kConstTensor); - tensor->SetMemAddr(input_data[1]); - - input_dim = input_shape[1]; - - std::vector weight_dims = {input_dim[0], input_dim[1], input_dim[2], input_dim[3]}; - - TShape& weight_shape = tensor->GetShape(); - - weight_shape.SetDataLayout("NCHW"); - weight_shape.SetDim(weight_dims); - - node->AddInputTensor(tensor); - - if(input_number == 2) - return 0; - - // bias - - tensor = new Tensor("bias"); - - tensor->SetDataType(DataType::GetTypeID("float32")); - tensor->SetType(kConstTensor); - tensor->SetMemAddr(input_data[2]); - - input_dim = input_shape[2]; - - std::vector bias_dims = {input_dim[0]}; - - TShape& bias_shape = tensor->GetShape(); - - bias_shape.SetDataLayout("W"); - bias_shape.SetDim(bias_dims); - - node->AddInputTensor(tensor); - - return 0; -} - -int test_node_set_input(test_node_t node, float* input_data[], int* input_shape[], int input_number) -{ - Node* test_node = ( Node* )node; - - Operator* op = test_node->GetOp(); - - if(op->GetName() == "Convolution") - return test_conv_node_set_input(test_node, input_data, input_shape, input_number); - - return -1; -} - -static int test_conv_node_set_output(Node* node, float* output_data, int* output_shape) -{ - Tensor* tensor = new Tensor("output"); - - tensor->SetDataType(DataType::GetTypeID("float32")); - tensor->SetType(kConstTensor); - tensor->SetMemAddr(output_data); - - int* output_dim = output_shape; - 
- std::vector output_dims = {output_dim[0], output_dim[1], output_dim[2], output_dim[3]}; - - TShape& shape = tensor->GetShape(); - - shape.SetDataLayout("NCHW"); - shape.SetDim(output_dims); - - node->AddOutputTensor(tensor); - - return 0; -} - -int test_node_set_output(test_node_t node, float* output_data[], int* output_shape[], int output_number) -{ - Node* test_node = ( Node* )node; - - Operator* op = test_node->GetOp(); - - if(op->GetName() == "Convolution") - return test_conv_node_set_output(test_node, output_data[0], output_shape[0]); - - return -1; -} - -static Graph* create_test_graph(Node* node) -{ - Graph* graph = new Graph(node->GetName()); - - node->SetNodeIndex(0); - graph->seq_nodes.push_back(node); - - graph->AddInputNode(node); - graph->AddOutputNode(node); - - /* for all tensors */ - - for(unsigned int i = 0; i < node->GetInputNum(); i++) - { - Tensor* tensor = node->GetInputTensor(i); - graph->AddTensorMap(tensor->GetName(), tensor); - } - - for(unsigned int i = 0; i < node->GetOutputNum(); i++) - { - Tensor* tensor = node->GetOutputTensor(i); - graph->AddTensorMap(tensor->GetName(), tensor); - } - - return graph; -} - -int test_node_prerun(test_node_t node) -{ - Node* test_node = ( Node* )node; - - // create graph for this node - - Graph* graph = create_test_graph(test_node); - - GraphExecutor* executor = new GraphExecutor(); - ExecContext* exec_context = ExecContext::GetDefaultContext(); - - if(!executor->AttachGraph(exec_context, graph) || !executor->Prerun()) - { - std::cout << "Prerun failed\n"; - return -1; - } - - test_node->SetAttr("TEST_EXECUTOR", executor); - - return 0; - - /* - NodeOps * node_ops=NodeOpsRegistryManager::FindNodeOps(cpu_dev->GetCPUInfo(),test_node); - - if(node_ops==nullptr) - return -1; - - auto dispatch=std::bind(&CPUDevice::PushAiderTask,cpu_dev,std::placeholders::_1, - std::placeholders::_2); - - auto wait=std::bind(&CPUDevice::WaitDone,cpu_dev); - - node_ops->SetHelper(std::malloc,std::free,dispatch,wait); - - - 
if(!node_ops->Prerun(test_node)) - { - std::cout<<"Prerun failed\n"; - return -1; - } - - test_node->SetAttr(ATTR_NODE_OPS,node_ops); - */ - - return 0; -} - -int test_node_run(test_node_t node) -{ - Node* test_node = ( Node* )node; - - GraphExecutor* executor = any_cast(test_node->GetAttr("TEST_EXECUTOR")); - - if(!executor->SyncRun()) - { - std::cout << "Run failed\n"; - return -1; - } - - return 0; - - /* - NodeOps * node_ops=any_cast(test_node->GetAttr(ATTR_NODE_OPS)); - - if(!node_ops->Run(test_node)) - { - std::cout<<"Run failed\n"; - return -1; - } - */ - - return 0; -} - -int test_node_postrun(test_node_t node) -{ - Node* test_node = ( Node* )node; - - GraphExecutor* executor = any_cast(test_node->GetAttr("TEST_EXECUTOR")); - - if(!executor->Postrun()) - { - std::cout << "Postrun failed\n"; - return -1; - } - - return 0; - - /* - NodeOps * node_ops=any_cast(test_node->GetAttr(ATTR_NODE_OPS)); - - if(!node_ops->Postrun(test_node)) - { - std::cout<<"Postrun failed\n"; - return -1; - } - */ - - return 0; -} - -void destroy_test_node(test_node_t node) -{ - Node* test_node = ( Node* )node; - - /* releaset graph executor & graph */ - - GraphExecutor* executor = any_cast(test_node->GetAttr("TEST_EXECUTOR")); - - Graph* graph = executor->GetGraph(); - - delete executor; - delete graph; - - /* free tensor */ - - for(unsigned int i = 0; i < test_node->GetInputNum(); i++) - { - Tensor* tensor = test_node->GetInputTensor(i); - - delete tensor; - } - - for(unsigned int i = 0; i < test_node->GetOutputNum(); i++) - { - Tensor* tensor = test_node->GetOutputTensor(i); - - delete tensor; - } - - /* free node */ - - delete test_node; -} diff --git a/executor/lib/tensor_mem.cpp b/executor/lib/tensor_mem.cpp index 3c4acbc7b..61c00f292 100644 --- a/executor/lib/tensor_mem.cpp +++ b/executor/lib/tensor_mem.cpp @@ -66,6 +66,8 @@ bool get_tensor_memptr(const Tensor* tensor, TensorMemPtr& ptr) bool set_tensor_mem(Tensor* tensor, void* addr, int size, mem_release_t releaser) { + 
if(addr == nullptr || size == 0) + return false; if(tensor->GetType() == kConstTensor) { LOG_DEBUG() << __FUNCTION__ << ": set const tensor " << tensor->GetName() << " mem: " << addr << "\n"; diff --git a/executor/operator/Makefile b/executor/operator/Makefile index 4d1070788..ade53fe0c 100644 --- a/executor/operator/Makefile +++ b/executor/operator/Makefile @@ -1,13 +1,10 @@ obj-$(CONFIG_ARCH_ARM64)+=arm64/ obj-y+=ref/ obj-y+=common/ +obj-y+=init.o COMMON_CFLAGS+=-I$(shell pwd)/include -ifneq ($(CONFIG_OPT_CFLAGS),) - COMMON_CFLAGS+=-O3 -funroll-loops -endif - #below are examples to build with pre-compiled object #prebuilt-obj-$(CONFIG_ARCH_ARM64)+=arm.o #prebuilt-obj-y+=ref/built-in.o diff --git a/executor/operator/arm64/Makefile b/executor/operator/arm64/Makefile index 19f1fa8de..0b7ba803e 100644 --- a/executor/operator/arm64/Makefile +++ b/executor/operator/arm64/Makefile @@ -8,7 +8,9 @@ obj-y+=pooling.o obj-y+=scale_neon.o obj-y+=init.o - -CXXFLAGS+=-I./include +CXXFLAGS+=-I./include pooling_CXXFLAGS+=-Wno-strict-aliasing + +COMMON_CFLAGS+=-fvisibility=hidden + diff --git a/executor/operator/arm64/batch_norm.cpp b/executor/operator/arm64/batch_norm.cpp index 3e55b5992..0fb59212c 100644 --- a/executor/operator/arm64/batch_norm.cpp +++ b/executor/operator/arm64/batch_norm.cpp @@ -35,7 +35,7 @@ #include namespace TEngine { -namespace BatchNormImpl { +namespace BatchNormImpl64 { struct BNOps : public NodeOps { @@ -206,15 +206,24 @@ struct BNOps : public NodeOps } }; -} // namespace BatchNormImpl +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + if((input->GetShape()).GetDim().size() != 4) + return nullptr; + + BNOps* ops = new BNOps(); + + return ops; +} + +} // namespace BatchNormImpl64 -using namespace BatchNormImpl; +using namespace BatchNormImpl64; void RegisterBatchNormNodeExec(void) { - BNOps* ops = new BNOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("arm64", BatchNormName, ops); + 
NodeOpsRegistryManager::RegisterOPImplementor("arm64", BatchNormName, BatchNormImpl64::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/arm64/conv/Makefile b/executor/operator/arm64/conv/Makefile index 103780883..8b4834741 100644 --- a/executor/operator/arm64/conv/Makefile +++ b/executor/operator/arm64/conv/Makefile @@ -9,4 +9,7 @@ obj-y+=dw_k3s2p1.o obj-y+=dw_k3s1p1_relu_fused.o obj-y+=dw_k3s2p1_relu_fused.o +CXXFLAGS+=-I../include + +conv_2d_acl_CXXFLAGS+=-I$(ACL_ROOT) -I$(ACL_ROOT)/include -I$(ACL_ROOT)/utils diff --git a/executor/operator/arm64/conv/conv_2d_dw.cpp b/executor/operator/arm64/conv/conv_2d_dw.cpp index f13cfbbe4..0e739eeb4 100644 --- a/executor/operator/arm64/conv/conv_2d_dw.cpp +++ b/executor/operator/arm64/conv/conv_2d_dw.cpp @@ -1,286 +1,286 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haitao@openailab.com - */ -#include -#include -#include - -#include "logger.hpp" -#include "tensor_mem.hpp" - -#include "graph.hpp" -#include "node_ops.hpp" -#include "operator/convolution.hpp" -#include -namespace TEngine { - -namespace conv_2d_dw { - -const char* conv_name = "CONV_DW"; -const int default_prio = 10; - -extern "C" void dw_k3s1p1(float* data, int h, int w, float* kernel, float* output, float* bias); -extern "C" void dw_k3s2p1(float* data, int h, int w, float* kernel, float* output, float* bias); -extern "C" void dw_k3s1p1_relu_fused(float* data, int h, int w, float* kernel, float* output, float* bias); -extern "C" void dw_k3s2p1_relu_fused(float* data, int h, int w, float* kernel, float* output, float* bias); - -struct dw_param -{ - float* input_buf; - int input_h; - int input_w; - float* output_buf; - int output_h; - int output_w; - float* weight_buf; - int channel_num; - int stride; - float* bias; -}; - -struct Conv2dDepth : public NodeOps -{ - bool Run(Node* node); - - int activation; - - void DirectConv(float* input_buf, int input_h, int input_w, float* output_buf, int output_h, int output_w, - float* weight_buf, int channel_num, int stride, float* bias); - - bool Aider(int cpu, int seq, void* data); -}; - -bool Conv2dDepth::Aider(int cpu, int seq, void* data) -{ - dw_param* param = ( dw_param* )data; - - DirectConv(param->input_buf, param->input_h, param->input_w, param->output_buf, param->output_h, param->output_w, - param->weight_buf, param->channel_num, param->stride, param->bias); - - return true; -} - -void Conv2dDepth::DirectConv(float* input_buf, int input_h, int input_w, float* output_buf, int output_h, int output_w, - float* weight_buf, int channel_num, int stride, float* bias) -{ - int channel_size = input_h * input_w; - float* bias_tmp = bias; - - for(int i = 0; i < channel_num; i++) - { - if(NULL != bias) - bias_tmp = bias + i; - if(stride == 1) - { - if(activation >= 0) - 
dw_k3s1p1_relu_fused(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); - else - dw_k3s1p1(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); - - if(activation > 0) - { - for(int i = 0; i < channel_size; i++) - output_buf[i] = std::min(output_buf[i], ( float )activation); - } - - input_buf += channel_size; - output_buf += channel_size; - weight_buf += 9; - } - else if(stride == 2) - { - if(activation >= 0) - dw_k3s2p1_relu_fused(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); - else - dw_k3s2p1(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); - - if(activation > 0) - { - for(int i = 0; i < output_h * output_w; i++) - output_buf[i] = std::min(output_buf[i], ( float )activation); - } - - input_buf += channel_size; - output_buf += output_h * output_w; - weight_buf += 9; - } - } -} - -bool Conv2dDepth::Run(Node* node) -{ - Tensor* input_tensor = node->GetInputTensor(0); - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - const TShape& input_shape = input_tensor->GetShape(); - - int input_c = input_shape.GetC(); - int input_h = input_shape.GetH(); - int input_w = input_shape.GetW(); - - /* output */ - Tensor* output_tensor = node->GetOutputTensor(0); - TShape& output_shape = output_tensor->GetShape(); - - int output_h = output_shape.GetH(); - int output_w = output_shape.GetW(); - int output_n = output_shape.GetN(); - - Tensor* weight_tensor = node->GetInputTensor(1); - float* weight_buf = ( float* )get_tensor_mem(weight_tensor); - float* input_buf = ( float* )get_tensor_mem(input_tensor); - float* output_buf = ( float* )get_tensor_mem(output_tensor); - - int stride_h = param->stride_h; - int cpu_number = cpu_info->GetCPUNumber(); - - float* bias = NULL; - // get bias - if(node->GetInputNum() > 2) - { - Tensor* bias_tensor = node->GetInputTensor(2); - bias = ( float* )get_tensor_mem(bias_tensor); - } - - for(int i = 0; i < output_n; i++) - { - if(cpu_number == 1) - 
DirectConv(input_buf, input_h, input_w, output_buf, output_h, output_w, weight_buf, input_c, stride_h, - bias); - else - { - // partition into 4 tasks - std::vector task_list; - std::vector param_list; - - auto f = std::bind(&Conv2dDepth::Aider, this, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - - task_list.resize(cpu_number); - param_list.resize(cpu_number); - - int step = input_c / cpu_number; - int channel_size = input_h * input_w; - - for(int i = 0; i < cpu_number; i++) - { - dw_param* param = ¶m_list[i]; - sub_op_task* task = &task_list[i]; - - task->exec_func = f; - task->seq = i; - task->data = param; - - param->input_buf = input_buf; - param->input_h = input_h; - param->input_w = input_w; - param->output_buf = output_buf; - param->output_h = output_h; - param->output_w = output_w; - param->weight_buf = weight_buf; - param->channel_num = step; - param->stride = stride_h; - if(NULL != bias) - param->bias = bias + i * step; - else - param->bias = NULL; - - input_buf += channel_size * step; - if(stride_h == 1) - output_buf += channel_size * step; - else - output_buf += output_h * output_w * step; - weight_buf += 9 * step; - } - - // the last left ones - param_list[cpu_number - 1].channel_num += input_c - cpu_number * step; - - task_dispatch(task_list, -1); - - wait_done(); - } - } - - return true; -} - -static bool isDepthwiseSupported(const ConvParam* param, const TShape& input_shape) -{ - int input_c = input_shape.GetC(); - int group = param->group; - int kernel_h = param->kernel_h; - int kernel_w = param->kernel_w; - int stride_h = param->stride_h; - int stride_w = param->stride_w; - int dilation_h = param->dilation_h; - int dilation_w = param->dilation_w; - int pad_h0 = param->pads[0]; - int pad_w0 = param->pads[1]; - int pad_h1 = param->pads[2]; - int pad_w1 = param->pads[3]; - - if(group == 1 || input_c != group || kernel_h != 3 || kernel_w != 3 || pad_h0 != 1 || pad_w0 != 1 || - pad_h0 != pad_h1 || pad_w0 != pad_w1 || 
dilation_h != 1 || dilation_w != 1 || stride_w != stride_h) - { - return false; - } - return true; -} - -NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) -{ - const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); - - if(exec_attr->layout == TENGINE_LAYOUT_NHWC) - return nullptr; - - Operator* op = node->GetOp(); - - Convolution* conv_op = dynamic_cast(op); - - ConvParam* param = conv_op->GetParam(); - - const TShape& input_shape = node->GetInputTensor(0)->GetShape(); - - if(!isDepthwiseSupported(param, input_shape)) - return nullptr; - - Conv2dDepth* ops = new Conv2dDepth(); - - ops->activation = param->activation; - - ops->need_free = true; - - return ops; -} - -} // namespace conv_2d_dw - -void RegisterConv2dDepth(void) -{ - NodeOpsRegistryManager::RegisterOPImplementor("arm64", "Convolution", conv_2d_dw::SelectFunc, - conv_2d_dw::default_prio); -} - -} // namespace TEngine +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ +#include +#include +#include + +#include "logger.hpp" +#include "tensor_mem.hpp" + +#include "graph.hpp" +#include "node_ops.hpp" +#include "operator/convolution.hpp" +#include +namespace TEngine { + +namespace conv_2d_dw { + +const char* conv_name = "CONV_DW"; +const int default_prio = 10; + +extern "C" void dw_k3s1p1(float* data, int h, int w, float* kernel, float* output, float* bias); +extern "C" void dw_k3s2p1(float* data, int h, int w, float* kernel, float* output, float* bias); +extern "C" void dw_k3s1p1_relu_fused(float* data, int h, int w, float* kernel, float* output, float* bias); +extern "C" void dw_k3s2p1_relu_fused(float* data, int h, int w, float* kernel, float* output, float* bias); + +struct dw_param +{ + float* input_buf; + int input_h; + int input_w; + float* output_buf; + int output_h; + int output_w; + float* weight_buf; + int channel_num; + int stride; + float* bias; +}; + +struct Conv2dDepth : public NodeOps +{ + bool Run(Node* node); + + int activation; + + void DirectConv(float* input_buf, int input_h, int input_w, float* output_buf, int output_h, int output_w, + float* weight_buf, int channel_num, int stride, float* bias); + + bool Aider(int cpu, int seq, void* data); +}; + +bool Conv2dDepth::Aider(int cpu, int seq, void* data) +{ + dw_param* param = ( dw_param* )data; + + DirectConv(param->input_buf, param->input_h, param->input_w, param->output_buf, param->output_h, param->output_w, + param->weight_buf, param->channel_num, param->stride, param->bias); + + return true; +} + +void Conv2dDepth::DirectConv(float* input_buf, int input_h, int input_w, float* output_buf, int output_h, int output_w, + float* weight_buf, int channel_num, int stride, float* bias) +{ + int channel_size = input_h * input_w; + float* bias_tmp = bias; + + for(int i = 0; i < channel_num; i++) + { + if(NULL != bias) + bias_tmp = bias + i; + if(stride == 1) + { + if(activation >= 0) + 
dw_k3s1p1_relu_fused(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); + else + dw_k3s1p1(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); + + if(activation > 0) + { + for(int i = 0; i < channel_size; i++) + output_buf[i] = std::min(output_buf[i], ( float )activation); + } + + input_buf += channel_size; + output_buf += channel_size; + weight_buf += 9; + } + else if(stride == 2) + { + if(activation >= 0) + dw_k3s2p1_relu_fused(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); + else + dw_k3s2p1(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); + + if(activation > 0) + { + for(int i = 0; i < output_h * output_w; i++) + output_buf[i] = std::min(output_buf[i], ( float )activation); + } + + input_buf += channel_size; + output_buf += output_h * output_w; + weight_buf += 9; + } + } +} + +bool Conv2dDepth::Run(Node* node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Convolution* conv_op = dynamic_cast(node->GetOp()); + ConvParam* param = conv_op->GetParam(); + const TShape& input_shape = input_tensor->GetShape(); + + int input_c = input_shape.GetC(); + int input_h = input_shape.GetH(); + int input_w = input_shape.GetW(); + + /* output */ + Tensor* output_tensor = node->GetOutputTensor(0); + TShape& output_shape = output_tensor->GetShape(); + + int output_h = output_shape.GetH(); + int output_w = output_shape.GetW(); + int output_n = output_shape.GetN(); + + Tensor* weight_tensor = node->GetInputTensor(1); + float* weight_buf = ( float* )get_tensor_mem(weight_tensor); + float* input_buf = ( float* )get_tensor_mem(input_tensor); + float* output_buf = ( float* )get_tensor_mem(output_tensor); + + int stride_h = param->stride_h; + int cpu_number = cpu_info->GetCPUNumber(); + + float* bias = NULL; + // get bias + if(node->GetInputNum() > 2) + { + Tensor* bias_tensor = node->GetInputTensor(2); + bias = ( float* )get_tensor_mem(bias_tensor); + } + + for(int i = 0; i < output_n; i++) + { + if(cpu_number == 1) + 
DirectConv(input_buf, input_h, input_w, output_buf, output_h, output_w, weight_buf, input_c, stride_h, + bias); + else + { + // partition into 4 tasks + std::vector task_list; + std::vector param_list; + + auto f = std::bind(&Conv2dDepth::Aider, this, std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + + task_list.resize(cpu_number); + param_list.resize(cpu_number); + + int step = input_c / cpu_number; + int channel_size = input_h * input_w; + + for(int i = 0; i < cpu_number; i++) + { + dw_param* param = ¶m_list[i]; + sub_op_task* task = &task_list[i]; + + task->exec_func = f; + task->seq = i; + task->data = param; + + param->input_buf = input_buf; + param->input_h = input_h; + param->input_w = input_w; + param->output_buf = output_buf; + param->output_h = output_h; + param->output_w = output_w; + param->weight_buf = weight_buf; + param->channel_num = step; + param->stride = stride_h; + if(NULL != bias) + param->bias = bias + i * step; + else + param->bias = NULL; + + input_buf += channel_size * step; + if(stride_h == 1) + output_buf += channel_size * step; + else + output_buf += output_h * output_w * step; + weight_buf += 9 * step; + } + + // the last left ones + param_list[cpu_number - 1].channel_num += input_c - cpu_number * step; + + task_dispatch(task_list, -1); + + wait_done(); + } + } + + return true; +} + +static bool isDepthwiseSupported(const ConvParam* param, const TShape& input_shape) +{ + int input_c = input_shape.GetC(); + int group = param->group; + int kernel_h = param->kernel_h; + int kernel_w = param->kernel_w; + int stride_h = param->stride_h; + int stride_w = param->stride_w; + int dilation_h = param->dilation_h; + int dilation_w = param->dilation_w; + int pad_h0 = param->pad_h0; + int pad_w0 = param->pad_w0; + int pad_h1 = param->pad_h1; + int pad_w1 = param->pad_w1; + + if(group == 1 || input_c != group || kernel_h != 3 || kernel_w != 3 || pad_h0 != 1 || pad_w0 != 1 || + pad_h0 != pad_h1 || pad_w0 != pad_w1 || dilation_h 
!= 1 || dilation_w != 1 || stride_w != stride_h) + { + return false; + } + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + + if(exec_attr->graph_layout == TENGINE_LAYOUT_NHWC) + return nullptr; + + Operator* op = node->GetOp(); + + Convolution* conv_op = dynamic_cast(op); + + ConvParam* param = conv_op->GetParam(); + + const TShape& input_shape = node->GetInputTensor(0)->GetShape(); + + if(!isDepthwiseSupported(param, input_shape)) + return nullptr; + + Conv2dDepth* ops = new Conv2dDepth(); + + ops->activation = param->activation; + + ops->need_free = true; + + return ops; +} + +} // namespace conv_2d_dw + +void RegisterConv2dDepth(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor("arm64", "Convolution", conv_2d_dw::SelectFunc, + conv_2d_dw::default_prio); +} + +} // namespace TEngine diff --git a/executor/operator/arm64/conv/conv_2d_fast.cpp b/executor/operator/arm64/conv/conv_2d_fast.cpp index 60ed834e1..e066df92f 100644 --- a/executor/operator/arm64/conv/conv_2d_fast.cpp +++ b/executor/operator/arm64/conv/conv_2d_fast.cpp @@ -1,1077 +1,1072 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: xiaowei@openailab.com - */ -#include -#include -#include -#include - -#include "logger.hpp" -#include "node_ops.hpp" -#include "tensor_mem.hpp" - -#include "graph.hpp" -#include "operator/convolution.hpp" -#include - -extern "C" void sgemm_4x16_interleave(bool have_biases, float* biases, float* input, float* kernel, float* output, - long kernel_size); -extern "C" void sgemm_4x4_interleave(bool have_biases, float* biases, float* input, float* kernel, float* output, - long kernel_size); -extern "C" void sgemm_4x16_interleave_relu_fused(bool have_biases, float* biases, float* input, float* kernel, - float* output, long kernel_size); -extern "C" void sgemm_4x4_interleave_relu_fused(bool have_biases, float* biases, float* input, float* kernel, - float* output, long kernel_size); - -namespace TEngine { - -namespace conv_fast { - -#define TYPE_A53 0 -#define TYPE_A72 1 -const char* conv_name = "CONV_FAST"; -const int default_prio = 1000; - -void im2col(float* im, float* col, int input_chan, int input_x, int input_y, int kernel_x, int kernel_y, int stride_x, - int stride_y, int dilation_x, int dilation_y, int pad_x0, int pad_x1, int pad_y0, int pad_y1, int output_x, - int output_y, int col_start, int col_end) -{ - int kernel_size = kernel_x * kernel_y * input_chan; - int input_xy = input_x * input_y; - int pad_x = pad_x0; - int pad_y = pad_y0; - float* cur_col = col + col_start * kernel_size; - bool is_1x1 = (kernel_x == 1) && (kernel_y == 1) && (stride_x == 1) && (stride_y == 1); - bool is_dilation = (dilation_x != 1) || (dilation_y != 1); - bool is_3x3 = (kernel_x == 3) && (kernel_y == 3) && (!is_dilation); - int col_i, col_j, kch, ky, kx, i, j; - - if(is_1x1) - { - for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) - { - for(col_j = 0; col_j < kernel_size; col_j++) - { - for(i = 0; i < 4; i++) - *cur_col++ = *(im + input_xy * col_j + col_i + i); - } - } - // final 4 input - if(col_end & 0x3) - { - 
for(col_j = 0; col_j < kernel_size; col_j++) - { - for(i = 0; i < 4; i++) - { - if((col_i + i) < col_end) - *cur_col++ = *(im + input_xy * col_j + col_i + i); - else - *cur_col++ = 0.0; - } - } - } - } - else if(is_3x3) - { - int stride_x2 = stride_x * 2; - int stride_x3 = stride_x * 3; - bool is_pad0 = (pad_x0 == 0) && (pad_y0 == 0) && (pad_x1 == 0) && (pad_y1 == 0); - for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) - { - cur_col = col + col_i * kernel_size; - int imy0 = col_i / output_x; - int imy3 = (col_i + 3) / output_x; - int imx0 = col_i - imy0 * output_x; - int imx3 = (col_i + 3) - imy3 * output_x; - if((imy0 == imy3) && - (is_pad0 || (imy0 != 0 && imx0 != 0 && imy0 != (output_y - 1) && imx3 != (output_x - 1)))) - { - float* l0 = im + (imy0 * stride_y - pad_y) * input_x + (imx0 * stride_x - pad_x); - float* l1 = l0 + input_x; - float* l2 = l0 + input_x * 2; - for(i = 0; i < input_chan; i++) - { - for(j = 0; j < 3; j++) - { - cur_col[j * 4 + 0] = l0[j]; - cur_col[j * 4 + 1] = l0[j + stride_x]; - cur_col[j * 4 + 2] = l0[j + stride_x2]; - cur_col[j * 4 + 3] = l0[j + stride_x3]; - cur_col[j * 4 + 12] = l1[j]; - cur_col[j * 4 + 13] = l1[j + stride_x]; - cur_col[j * 4 + 14] = l1[j + stride_x2]; - cur_col[j * 4 + 15] = l1[j + stride_x3]; - cur_col[j * 4 + 24] = l2[j]; - cur_col[j * 4 + 25] = l2[j + stride_x]; - cur_col[j * 4 + 26] = l2[j + stride_x2]; - cur_col[j * 4 + 27] = l2[j + stride_x3]; - } - cur_col += 36; - l0 += input_xy; - l1 += input_xy; - l2 += input_xy; - } - } - else - { - int cnt_y[4] = {imy0, (col_i + 1) / output_x, (col_i + 2) / output_x, imy3}; - int cnt_x[4] = {imx0, col_i - cnt_y[1] * output_x + 1, col_i - cnt_y[2] * output_x + 2, imx3}; - int imx_start[4] = {cnt_x[0] * stride_x - pad_x, cnt_x[1] * stride_x - pad_x, - cnt_x[2] * stride_x - pad_x, cnt_x[3] * stride_x - pad_x}; - int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, - cnt_y[2] * stride_y - pad_y, cnt_y[3] * stride_y - pad_y}; - for(kch = 
0; kch < input_chan; kch++) - for(ky = 0; ky < 3; ky++) - for(kx = 0; kx < 3; kx++) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for(i = 0; i < 4; i++) - { - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) - *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); - else - *cur_col++ = 0.0; - } - } - } - } - // final 4 input - if(col_end & 0x3) - { - int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; - int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, - col_i - cnt_y[2] * output_x + 2, col_i - cnt_y[3] * output_x + 3}; - int imx_start[4] = {cnt_x[0] * stride_x - pad_x, cnt_x[1] * stride_x - pad_x, cnt_x[2] * stride_x - pad_x, - cnt_x[3] * stride_x - pad_x}; - int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, - cnt_y[3] * stride_y - pad_y}; - for(kch = 0; kch < input_chan; kch++) - for(ky = 0; ky < 3; ky++) - for(kx = 0; kx < 3; kx++) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for(i = 0; i < 4; i++) - { - if((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && - imy[i] < input_y) - *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); - else - *cur_col++ = 0.0; - } - } - } - } - else - { // for general cases - for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) - { - int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; - int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, - col_i - cnt_y[2] * output_x + 2, col_i - cnt_y[3] * output_x + 3}; - int imx_start[4] = {cnt_x[0] * stride_x - 
pad_x, cnt_x[1] * stride_x - pad_x, cnt_x[2] * stride_x - pad_x, - cnt_x[3] * stride_x - pad_x}; - int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, - cnt_y[3] * stride_y - pad_y}; - for(kch = 0; kch < input_chan; kch++) - for(ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y) - for(kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for(i = 0; i < 4; i++) - { - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) - *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); - else - *cur_col++ = 0.0; - } - } - } - // final 4 input - if(col_end & 0x3) - { - int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; - int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, - col_i - cnt_y[2] * output_x + 2, col_i - cnt_y[3] * output_x + 3}; - int imx_start[4] = {cnt_x[0] * stride_x - pad_x, cnt_x[1] * stride_x - pad_x, cnt_x[2] * stride_x - pad_x, - cnt_x[3] * stride_x - pad_x}; - int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, - cnt_y[3] * stride_y - pad_y}; - for(kch = 0; kch < input_chan; kch++) - for(ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y) - for(kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for(i = 0; i < 4; i++) - { - if((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && - imy[i] < input_y) - *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); - else - *cur_col++ = 0.0; - } - } - } - } -} - -// interleave 0 ~ 
(output_chan & -16) kernels with 16 in form of k[0-15][0],k[0-15][1],k[0-15][2].. -// interleave (output_chan & -16) ~ ((output_chan + 3) & -4) tail kernls with 4 in form of -// k[0-3][0],k[0-3][1],k[0-3][2].. -void interleave_kernel(float* kernel, float* kernel_interleaved, int kernel_chan, int kernel_size) -{ - int i, j; - float *cur_kernel0, *cur_kernel1, *cur_kernel2, *cur_kernel3, *cur_kernel4, *cur_kernel5, *cur_kernel6, - *cur_kernel7; - float *cur_kernel8, *cur_kernel9, *cur_kernel10, *cur_kernel11, *cur_kernel12, *cur_kernel13, *cur_kernel14, - *cur_kernel15; - float* cur_kernel_interleaved = kernel_interleaved; - - // interleave 16 kernels - for(i = 0; i < (kernel_chan & -16); i += 16) - { - cur_kernel0 = kernel + kernel_size * i; - cur_kernel1 = kernel + kernel_size * (i + 1); - cur_kernel2 = kernel + kernel_size * (i + 2); - cur_kernel3 = kernel + kernel_size * (i + 3); - cur_kernel4 = kernel + kernel_size * (i + 4); - cur_kernel5 = kernel + kernel_size * (i + 5); - cur_kernel6 = kernel + kernel_size * (i + 6); - cur_kernel7 = kernel + kernel_size * (i + 7); - cur_kernel8 = kernel + kernel_size * (i + 8); - cur_kernel9 = kernel + kernel_size * (i + 9); - cur_kernel10 = kernel + kernel_size * (i + 10); - cur_kernel11 = kernel + kernel_size * (i + 11); - cur_kernel12 = kernel + kernel_size * (i + 12); - cur_kernel13 = kernel + kernel_size * (i + 13); - cur_kernel14 = kernel + kernel_size * (i + 14); - cur_kernel15 = kernel + kernel_size * (i + 15); - for(j = 0; j < kernel_size; j++) - { - *(cur_kernel_interleaved++) = cur_kernel0[j]; - *(cur_kernel_interleaved++) = cur_kernel1[j]; - *(cur_kernel_interleaved++) = cur_kernel2[j]; - *(cur_kernel_interleaved++) = cur_kernel3[j]; - *(cur_kernel_interleaved++) = cur_kernel4[j]; - *(cur_kernel_interleaved++) = cur_kernel5[j]; - *(cur_kernel_interleaved++) = cur_kernel6[j]; - *(cur_kernel_interleaved++) = cur_kernel7[j]; - *(cur_kernel_interleaved++) = cur_kernel8[j]; - *(cur_kernel_interleaved++) = 
cur_kernel9[j]; - *(cur_kernel_interleaved++) = cur_kernel10[j]; - *(cur_kernel_interleaved++) = cur_kernel11[j]; - *(cur_kernel_interleaved++) = cur_kernel12[j]; - *(cur_kernel_interleaved++) = cur_kernel13[j]; - *(cur_kernel_interleaved++) = cur_kernel14[j]; - *(cur_kernel_interleaved++) = cur_kernel15[j]; - } - } - - for(i = (kernel_chan & -16); i < (kernel_chan & -4); i += 4) - { - cur_kernel0 = kernel + kernel_size * i; - cur_kernel1 = kernel + kernel_size * (i + 1); - cur_kernel2 = kernel + kernel_size * (i + 2); - cur_kernel3 = kernel + kernel_size * (i + 3); - for(j = 0; j < kernel_size; j++) - { - *(cur_kernel_interleaved++) = cur_kernel0[j]; - *(cur_kernel_interleaved++) = cur_kernel1[j]; - *(cur_kernel_interleaved++) = cur_kernel2[j]; - *(cur_kernel_interleaved++) = cur_kernel3[j]; - } - } - // last 4 kernel - cur_kernel0 = kernel + kernel_size * i; - cur_kernel1 = kernel + kernel_size * (i + 1); - cur_kernel2 = kernel + kernel_size * (i + 2); - if((kernel_chan & 0x3) == 3) - { - for(j = 0; j < kernel_size; j++) - { - *(cur_kernel_interleaved++) = cur_kernel0[j]; - *(cur_kernel_interleaved++) = cur_kernel1[j]; - *(cur_kernel_interleaved++) = cur_kernel2[j]; - *(cur_kernel_interleaved++) = 0.0; - } - } - else if((kernel_chan & 0x3) == 2) - { - for(j = 0; j < kernel_size; j++) - { - *(cur_kernel_interleaved++) = cur_kernel0[j]; - *(cur_kernel_interleaved++) = cur_kernel1[j]; - *(cur_kernel_interleaved++) = 0.0; - *(cur_kernel_interleaved++) = 0.0; - } - } - else if((kernel_chan & 0x3) == 1) - { - for(j = 0; j < kernel_size; j++) - { - *(cur_kernel_interleaved++) = cur_kernel0[j]; - *(cur_kernel_interleaved++) = 0.0; - *(cur_kernel_interleaved++) = 0.0; - *(cur_kernel_interleaved++) = 0.0; - } - } - - return; -} - -static void sgemm4x16(float* col, float* kernel, float* biases, bool bias_term, float* output, int kernel_size, - int col_start, int col_end, int kernel_start, int kernel_end, int output_xy, int activation, - int cpu_type) -{ - float initial[64], 
result[64]; - int col_line, kernel_num; - int i, j; - float *cur_col, *cur_kernel; - - for(kernel_num = (kernel_start & -16); kernel_num < (kernel_end & -16); kernel_num += 16) - { - if(bias_term) - for(i = 0; i < 64; i++) - initial[i] = *(biases + kernel_num + (i >> 2)); - cur_kernel = ( float* )(kernel + kernel_num * kernel_size); - - for(col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) - { - cur_col = ( float* )(col + col_line * kernel_size); - if(activation >= 0) - sgemm_4x16_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - else - sgemm_4x16_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - - if(activation > 0) - { - for(i = 0; i < 16; i++) - { - *(output + (kernel_num + i) * output_xy + col_line) = - std::min(result[(i << 2)], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 1) = - std::min(result[(i << 2) + 1], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 2) = - std::min(result[(i << 2) + 2], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 3) = - std::min(result[(i << 2) + 3], ( float )activation); - } - } - else - { - for(i = 0; i < 16; i++) - { - *(output + (kernel_num + i) * output_xy + col_line) = result[(i << 2)]; - *(output + (kernel_num + i) * output_xy + col_line + 1) = result[(i << 2) + 1]; - *(output + (kernel_num + i) * output_xy + col_line + 2) = result[(i << 2) + 2]; - *(output + (kernel_num + i) * output_xy + col_line + 3) = result[(i << 2) + 3]; - } - } - } - if(col_end & 0x3) - { - cur_col = ( float* )(col + col_line * kernel_size); - - if(activation >= 0) - sgemm_4x16_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - else - sgemm_4x16_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - - for(i = 0; i < 16; i++) - for(j = 0; j < (col_end & 0x3); j++) - { - if(activation > 0) - *(output + (kernel_num + i) * 
output_xy + col_line + j) = - std::min(result[(i << 2) + j], ( float )activation); - else - *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - } - } -} - -static void sgemm4x4(float* col, float* kernel, float* biases, bool bias_term, float* output, int kernel_size, - int col_start, int col_end, int kernel_start, int kernel_end, int output_xy, int activation, - int cpu_type) -{ - float initial[16], result[16]; - int col_line, kernel_num; - int i, j; - float *cur_col, *cur_kernel; - - for(kernel_num = kernel_start & -4; kernel_num < (kernel_end & -4); kernel_num += 4) - { - if(bias_term) - for(i = 0; i < 16; i++) - initial[i] = *(biases + kernel_num + (i >> 2)); - cur_kernel = ( float* )(kernel + kernel_num * kernel_size); - for(col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) - { - cur_col = ( float* )(col + col_line * kernel_size); - - if(activation >= 0) - sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - else - sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - - if(activation > 0) - { - for(i = 0; i < 4; i++) - { - *(output + (kernel_num + i) * output_xy + col_line) = - std::min(result[(i << 2) + 0], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 1) = - std::min(result[(i << 2) + 1], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 2) = - std::min(result[(i << 2) + 2], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 3) = - std::min(result[(i << 2) + 3], ( float )activation); - } - } - else - { - for(i = 0; i < 4; i++) - { - *(output + (kernel_num + i) * output_xy + col_line) = result[(i << 2) + 0]; - *(output + (kernel_num + i) * output_xy + col_line + 1) = result[(i << 2) + 1]; - *(output + (kernel_num + i) * output_xy + col_line + 2) = result[(i << 2) + 2]; - *(output + (kernel_num + i) * output_xy + col_line + 3) = result[(i << 2) + 3]; 
- } - } - } - if(col_end & 0x3) - { - cur_col = ( float* )(col + col_line * kernel_size); - if(activation >= 0) - sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - else - sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - for(i = 0; i < 4; i++) - { - for(j = 0; j < (col_end & 0x3); j++) - { - if(activation > 0) - *(output + (kernel_num + i) * output_xy + col_line + j) = - std::min(result[(i << 2) + j], ( float )activation); - else - *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - } - } - } - - if(kernel_end & 0x3) - { - if(bias_term) - for(i = 0; i < ((kernel_end & 0x3) << 2); i++) - initial[i] = *(biases + kernel_num + (i >> 2)); - cur_kernel = ( float* )(kernel + kernel_num * kernel_size); - for(col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) - { - cur_col = ( float* )(col + col_line * kernel_size); - - if(activation >= 0) - sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - else - sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - - if(activation > 0) - { - for(i = 0; i < (kernel_end & 0x3); i++) - { - *(output + (kernel_num + i) * output_xy + col_line) = - std::min(result[(i << 2) + 0], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 1) = - std::min(result[(i << 2) + 1], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 2) = - std::min(result[(i << 2) + 2], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 3) = - std::min(result[(i << 2) + 3], ( float )activation); - } - } - else - { - for(i = 0; i < (kernel_end & 0x3); i++) - { - *(output + (kernel_num + i) * output_xy + col_line) = result[(i << 2) + 0]; - *(output + (kernel_num + i) * output_xy + col_line + 1) = result[(i << 2) + 1]; - *(output + (kernel_num + i) * output_xy + col_line + 2) = result[(i << 2) 
+ 2]; - *(output + (kernel_num + i) * output_xy + col_line + 3) = result[(i << 2) + 3]; - } - } - } - if(col_end & 0x3) - { - cur_col = ( float* )(col + col_line * kernel_size); - if(activation >= 0) - sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - else - sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - - for(i = 0; i < (kernel_end & 0x3); i++) - { - for(j = 0; j < (col_end & 0x3); j++) - { - if(activation > 0) - *(output + (kernel_num + i) * output_xy + col_line + j) = - std::min(result[(i << 2) + j], ( float )activation); - else - *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - } - } - } -} - -struct im2col_param -{ - float* im; - float* col; - int input_chan; - int input_x; - int input_y; - int kernel_x; - int kernel_y; - int stride_x; - int stride_y; - int dilation_x; - int dilation_y; - int pad_x0; - int pad_x1; - int pad_y0; - int pad_y1; - int output_x; - int output_y; - int col_start; - int col_end; -}; - -struct sgemm_param -{ - float* col; - float* kernel; - float* biases; - bool bias_term; - float* output; - int kernel_size; - int col_start; - int col_end; - int kernel_start; - int kernel_end; - int output_xy; -}; - -struct conv1x1s1_param -{ - const float* input; - float* output; - const float* kernel; - const float* bias; - int in_h; - int in_w; - int in_ch; - int out_h; - int out_w; - int out_ch; - bool relu_fused; -}; - -struct ConvFast : public MTNodeOps -{ - bool Prerun(Node* node) override; - bool Reshape(Node* node) override; - bool Run(Node* node) override; - bool Postrun(Node* node) override; - bool GetSharedMemorySize(Node*, unsigned int& mem_size) override; - bool SetSharedMemoryAddr(Node*, void* mem_addr, int mem_size) override; - - bool float_mode; - bool im2col_aider(int cpu, int seq, void* data /* im2col_param * param */); - bool sgemm_aider(int cpu, int seq, void* data /* sgemm_param * param */); - bool 
sgemm4x4_aider(int cpu, int seq, void* data /* sgemm_param * param */); - - int activation; - bool dynamic_shape; -}; - -bool ConvFast::im2col_aider(int cpu, int seq, void* data) -{ - im2col_param* param = ( im2col_param* )(data); - im2col(param->im, param->col, param->input_chan, param->input_x, param->input_y, param->kernel_x, param->kernel_y, - param->stride_x, param->stride_y, param->dilation_x, param->dilation_y, param->pad_x0, param->pad_x1, - param->pad_y0, param->pad_y1, param->output_x, param->output_y, param->col_start, param->col_end); - - return true; -} - -bool ConvFast::sgemm4x4_aider(int cpu, int seq, void* data) -{ - int cpu_type = TYPE_A72; - sgemm_param* param = ( sgemm_param* )(data); - - sgemm4x4(param->col, param->kernel, param->biases, param->bias_term, param->output, param->kernel_size, - param->col_start, param->col_end, param->kernel_start, param->kernel_end, param->output_xy, activation, - cpu_type); - - return true; -} - -bool ConvFast::sgemm_aider(int cpu, int seq, void* data) -{ - int cpu_type = TYPE_A72; - sgemm_param* param = ( sgemm_param* )(data); - - sgemm4x16(param->col, param->kernel, param->biases, param->bias_term, param->output, param->kernel_size, - param->col_start, param->col_end, param->kernel_start, param->kernel_end, param->output_xy, activation, - cpu_type); - - return true; -} - -bool ConvFast::Prerun(Node* node) -{ - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - int group = param->group; - - Tensor* output_tensor = node->GetOutputTensor(0); - TShape& output_shape = output_tensor->GetShape(); - int output_chan = output_shape.GetC() / group; - - /* pre-allocate col_buf */ - Tensor* input_tensor = node->GetInputTensor(0); - TShape& input_shape = input_tensor->GetShape(); - - int input_chan = input_shape.GetC() / group; - int kernel_size = input_chan * param->kernel_h * param->kernel_w; - - if(!dynamic_shape) - { - if(node->ExistAttr("shared_col_buf")) - { - float* addr = 
( float* )any_cast(node->GetAttr("shared_col_buf")); - - (*node)["col_buf"] = addr; - } - else - { - unsigned int col_size; - - GetSharedMemorySize(node, col_size); - - float* col_buf = ( float* )mem_alloc(col_size); - (*node)["col_buf"] = col_buf; - node->SetAttr("col_buf_allocated", col_size); - } - } - - /* packing kernel data */ - Tensor* kernel_tensor = node->GetInputTensor(1); - - float* kernel_interleaved = NULL; - - int kernel_interleaved_size_g = kernel_size * ((output_chan + 3) & -4); - int kernel_size_g = kernel_size * output_chan; - float* kernel_org = ( float* )get_tensor_mem(kernel_tensor); - kernel_interleaved = ( float* )mem_alloc(sizeof(float) * (kernel_interleaved_size_g * group) + 128); - - for(int g = 0; g < group; ++g) - { - float* kernel = kernel_org + g * kernel_size_g; - float* kernel_interleaved_g = kernel_interleaved + g * kernel_interleaved_size_g; - interleave_kernel(kernel, kernel_interleaved_g, output_chan, kernel_size); - } - - (*node)["kernel_interleaved"] = kernel_interleaved; - - if(exec_attr->low_mem_mode) - { - printf("free convolution kernel: %s %d\n", kernel_tensor->GetName().c_str(), kernel_tensor->GetTotalSize()); - - kernel_tensor->FreeMem(); - } - - return true; -} - -bool ConvFast::Reshape(Node* node) -{ - unsigned int new_col_size; - - GetSharedMemorySize(node, new_col_size); - - if(node->ExistAttr("col_buf_allocated")) - { - unsigned int col_size = any_cast(node->GetAttr("col_buf_allocated")); - if(new_col_size == col_size) - return true; - - float* addr = any_cast(node->GetAttr("col_buf")); - mem_free(addr); - } - - float* col_buf = ( float* )mem_alloc(new_col_size); - (*node)["col_buf"] = col_buf; - - node->SetAttr("col_buf_allocated", new_col_size); - return true; -} - -bool ConvFast::Run(Node* node) -{ - /* input */ - Tensor* input_tensor = node->GetInputTensor(0); - - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - - const TShape& input_shape = 
input_tensor->GetShape(); - - int group = param->group; - int input_chan = input_shape.GetC() / group; - int input_h = input_shape.GetH(); - int input_w = input_shape.GetW(); - int input_size = input_w * input_h * input_chan; - int pad_x0 = param->pads[1]; // left padding columns - int pad_x1 = param->pads[3]; // right padding columns - int pad_y0 = param->pads[0]; // top padding rows - int pad_y1 = param->pads[2]; // bottom padding rows - int stride_x = param->stride_w; - int stride_y = param->stride_h; - int dilation_x = param->dilation_w; - int dilation_y = param->dilation_h; - float* input_org = ( float* )get_tensor_mem(input_tensor); - float* col = any_cast(node->GetAttr("col_buf")); - - /* output */ - Tensor* output_tensor = node->GetOutputTensor(0); - TShape& output_shape = output_tensor->GetShape(); - float* output_org = ( float* )get_tensor_mem(output_tensor); - int output_y = output_shape.GetH(); - int output_x = output_shape.GetW(); - int output_xy = output_x * output_y; - int output_chan = output_shape.GetC() / group; - int output_n = output_shape.GetN(); - - /* kernel */ - int kernel_x = param->kernel_w; - int kernel_y = param->kernel_h; - int kernel_size = input_chan * kernel_x * kernel_y; - - float* kernel_interleaved = any_cast(node->GetAttr("kernel_interleaved")); - - int cpu_number = cpu_info->GetCPUNumber(); - - /* biases */ - - float* biases = NULL; - bool have_biases = (node->GetInputNum() > 2); - - if(have_biases) - { - biases = ( float* )get_tensor_mem(node->GetInputTensor(2)); - } - - int cpu_type; - - if(cpu_info->GetCPUModel(cpu_info->GetMasterCPU()) == CPU_A72) - cpu_type = TYPE_A72; - else - cpu_type = TYPE_A53; - - /* block size split parameter */ - int L2_CACHE_SIZE = (cpu_type == TYPE_A53) ? 512 * 1024 : 1024 * 1024; - int kernel_size_l1 = kernel_size; - int col_cnt_l2 = L2_CACHE_SIZE / 4 / kernel_size_l1 * 7 / 8; - col_cnt_l2 = col_cnt_l2 > 4 ? 
(col_cnt_l2 & -4) : 4; - - /* one image per time */ - for(int i = 0; i < output_n; i++) - { - float* input = input_org + i * input_size * group; - float* output = output_org + i * output_xy * output_chan * group; - - for(int g = 0; g < group; g++) - { - float* input_g = input + g * input_size; - int total_num = output_xy * input_chan * kernel_x * kernel_y; - - if(cpu_number == 1 || total_num < 100 * 1000) - im2col(input_g, col, input_chan, input_w, input_h, kernel_x, kernel_y, stride_x, stride_y, dilation_x, - dilation_y, pad_x0, pad_x1, pad_y0, pad_y1, output_x, output_y, 0, output_xy); - else - { - std::vector task_list; - std::vector param_list; - - auto f = std::bind(&ConvFast::im2col_aider, this, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - - int steps = output_xy / cpu_number; - - steps = (steps + 3) & (~0x3); - - int offset; - int real_cpu_number = cpu_number; - - while(1) - { - offset = steps * real_cpu_number - output_xy; - - if(offset < steps) - break; - - real_cpu_number--; - } - - task_list.resize(real_cpu_number); - param_list.resize(real_cpu_number); - - for(int i = 0; i < real_cpu_number; i++) - { - im2col_param* param = ¶m_list[i]; - sub_op_task* task = &task_list[i]; - - task->exec_func = f; - task->seq = i; - task->data = param; - - param->im = input_g; - param->col = col; - param->input_chan = input_chan; - param->input_x = input_w; - param->input_y = input_h; - param->kernel_x = kernel_x; - param->kernel_y = kernel_y; - param->stride_x = stride_x; - param->stride_y = stride_y; - param->dilation_x = dilation_x; - param->dilation_y = dilation_y; - param->pad_x0 = pad_x0; - param->pad_x1 = pad_x1; - param->pad_y0 = pad_y0; - param->pad_y1 = pad_y1; - param->output_x = output_x; - param->output_y = output_y; - param->col_start = i * steps; - param->col_end = param->col_start + steps; - } - - param_list[real_cpu_number - 1].col_end = output_xy; - - task_dispatch(task_list, -1); - wait_done(); - } - - float* kernel_g = 
kernel_interleaved + g * (kernel_size * ((output_chan + 3) & -4)); - float* output_g = output + g * output_xy * output_chan; - float* bias_g = biases + g * output_chan; - - std::vector task_list; - std::vector param_list; - - int chan_16_num = output_chan / 16; - int chan_4_num = (output_chan & 0xf) ? 1 : 0; - int l2_loop = (output_xy - 1) / col_cnt_l2 + 1; - int max_task_num = l2_loop * (chan_16_num + chan_4_num); - - if(cpu_number > 1) - param_list.resize(max_task_num); - - // for input block of L2 cache size - for(int col_i = 0; col_i < output_xy; col_i += col_cnt_l2) - { - int col_start = col_i; - int col_end = col_i + col_cnt_l2; - col_end = col_end > output_xy ? output_xy : col_end; - - if(cpu_number == 1) - { - sgemm4x16(col, kernel_g, bias_g, have_biases, output_g, kernel_size, col_start, col_end, 0, - output_chan & -16, output_xy, activation, cpu_type); - if(output_chan & 0xf) - sgemm4x4(col, kernel_g, bias_g, have_biases, output_g, kernel_size, col_start, col_end, - output_chan & -16, output_chan, output_xy, activation, cpu_type); - } - else - { - auto f = std::bind(&ConvFast::sgemm_aider, this, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - - for(int i = 0; i < chan_16_num; i++) - { - sub_op_task tmp_task; - sgemm_param* param = ¶m_list[task_list.size()]; - sub_op_task* task = &tmp_task; - task->exec_func = f; - task->seq = i; - task->data = param; - - param->col = col; - param->kernel = kernel_g; - param->biases = bias_g; - param->bias_term = have_biases; - param->output = output_g; - param->kernel_size = kernel_size; - param->col_start = col_start; - param->col_end = col_end; - param->kernel_start = i * 16; - param->kernel_end = param->kernel_start + 16; - param->output_xy = output_xy; - - task_list.emplace_back(tmp_task); - } - - if(output_chan & 0xf) - { - auto f = std::bind(&ConvFast::sgemm4x4_aider, this, std::placeholders::_1, - std::placeholders::_2, std::placeholders::_3); - sub_op_task tmp_task; - sgemm_param* param = 
¶m_list[task_list.size()]; - sub_op_task* task = &tmp_task; - task->exec_func = f; - task->seq = task_list.size() - 1; - task->data = param; - - param->col = col; - param->kernel = kernel_g; - param->biases = bias_g; - param->bias_term = have_biases; - param->output = output_g; - param->kernel_size = kernel_size; - param->col_start = col_start; - param->col_end = col_end; - param->kernel_start = output_chan & -16; - param->kernel_end = output_chan; - param->output_xy = output_xy; - - task_list.emplace_back(tmp_task); - } - } - } - - if(cpu_number > 1) - { - task_dispatch(task_list, -1); - wait_done(); - } - } - } - - return true; -} - -bool ConvFast::Postrun(Node* node) -{ - if(node->ExistAttr("kernel_interleaved")) - { - float* addr; - addr = any_cast(node->GetAttr("kernel_interleaved")); - - mem_free(addr); - - node->RemoveAttr("kernel_interleaved"); - } - - if(node->ExistAttr("col_buf_allocated")) - { - float* addr = any_cast(node->GetAttr("col_buf")); - mem_free(addr); - - node->RemoveAttr("col_buf_allocated"); - } - - if(node->ExistAttr("col_buf")) - node->RemoveAttr("col_buf"); - - return true; -} - -bool ConvFast::GetSharedMemorySize(Node* node, unsigned int& mem_size) -{ - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - int group = param->group; - - Tensor* output_tensor = node->GetOutputTensor(0); - TShape& output_shape = output_tensor->GetShape(); - int output_y = output_shape.GetH(); - int output_x = output_shape.GetW(); - - Tensor* input_tensor = node->GetInputTensor(0); - TShape& input_shape = input_tensor->GetShape(); - - int input_chan = input_shape.GetC() / group; - int kernel_size = input_chan * param->kernel_h * param->kernel_w; - int output_xy = output_x * output_y; - - mem_size = (sizeof(float) * (kernel_size * ((output_xy + 3) & -4)) + 128); - - return true; -} - -bool ConvFast::SetSharedMemoryAddr(Node* node, void* mem_addr, int mem_size) -{ - (*node)["shared_col_buf"] = mem_addr; - return true; 
-} - -NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) -{ - const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); - - if(exec_attr->kernel_mode != EXEC_KERNEL_FP32) - return nullptr; - - if(exec_attr->layout == TENGINE_LAYOUT_NHWC) - return nullptr; - - ConvFast* ops = new ConvFast(); - - ops->need_free = true; - - if(node->IsDynamicShape()) - ops->dynamic_shape = true; - else - ops->dynamic_shape = false; - - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - - ops->activation = param->activation; - - return ops; -} - -} // conv_fast - -void RegisterConv2dFast(void) -{ - NodeOpsRegistryManager::RegisterOPImplementor("arm64", "Convolution", conv_fast::SelectFunc, - conv_fast::default_prio); -} - -} // namespace TEngine +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: xiaowei@openailab.com + */ +#include +#include +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +#include "graph.hpp" +#include "operator/convolution.hpp" +#include + +extern "C" void sgemm_4x16_interleave(bool have_biases, float* biases, float* input, float* kernel, float* output, + long kernel_size); +extern "C" void sgemm_4x4_interleave(bool have_biases, float* biases, float* input, float* kernel, float* output, + long kernel_size); +extern "C" void sgemm_4x16_interleave_relu_fused(bool have_biases, float* biases, float* input, float* kernel, + float* output, long kernel_size); +extern "C" void sgemm_4x4_interleave_relu_fused(bool have_biases, float* biases, float* input, float* kernel, + float* output, long kernel_size); + +namespace TEngine { + +namespace conv_fast { + +#define TYPE_A53 0 +#define TYPE_A72 1 +const char* conv_name = "CONV_FAST"; +const int default_prio = 1000; + +void im2col(float* im, float* col, int input_chan, int input_x, int input_y, int kernel_x, int kernel_y, int stride_x, + int stride_y, int dilation_x, int dilation_y, int pad_x0, int pad_x1, int pad_y0, int pad_y1, int output_x, + int output_y, int col_start, int col_end) +{ + int kernel_size = kernel_x * kernel_y * input_chan; + int input_xy = input_x * input_y; + int pad_x = pad_x0; + int pad_y = pad_y0; + float* cur_col = col + col_start * kernel_size; + bool is_1x1 = (kernel_x == 1) && (kernel_y == 1) && (stride_x == 1) && (stride_y == 1); + bool is_dilation = (dilation_x != 1) || (dilation_y != 1); + bool is_3x3 = (kernel_x == 3) && (kernel_y == 3) && (!is_dilation); + int col_i, col_j, kch, ky, kx, i, j; + + if(is_1x1) + { + for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + { + for(col_j = 0; col_j < kernel_size; col_j++) + { + for(i = 0; i < 4; i++) + *cur_col++ = *(im + input_xy * col_j + col_i + i); + } + } + // final 4 input + if(col_end & 0x3) + { + 
for(col_j = 0; col_j < kernel_size; col_j++) + { + for(i = 0; i < 4; i++) + { + if((col_i + i) < col_end) + *cur_col++ = *(im + input_xy * col_j + col_i + i); + else + *cur_col++ = 0.0; + } + } + } + } + else if(is_3x3) + { + int stride_x2 = stride_x * 2; + int stride_x3 = stride_x * 3; + bool is_pad0 = (pad_x0 == 0) && (pad_y0 == 0) && (pad_x1 == 0) && (pad_y1 == 0); + for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + { + cur_col = col + col_i * kernel_size; + int imy0 = col_i / output_x; + int imy3 = (col_i + 3) / output_x; + int imx0 = col_i - imy0 * output_x; + int imx3 = (col_i + 3) - imy3 * output_x; + if((imy0 == imy3) && + (is_pad0 || (imy0 != 0 && imx0 != 0 && imy0 != (output_y - 1) && imx3 != (output_x - 1)))) + { + float* l0 = im + (imy0 * stride_y - pad_y) * input_x + (imx0 * stride_x - pad_x); + float* l1 = l0 + input_x; + float* l2 = l0 + input_x * 2; + for(i = 0; i < input_chan; i++) + { + for(j = 0; j < 3; j++) + { + cur_col[j * 4 + 0] = l0[j]; + cur_col[j * 4 + 1] = l0[j + stride_x]; + cur_col[j * 4 + 2] = l0[j + stride_x2]; + cur_col[j * 4 + 3] = l0[j + stride_x3]; + cur_col[j * 4 + 12] = l1[j]; + cur_col[j * 4 + 13] = l1[j + stride_x]; + cur_col[j * 4 + 14] = l1[j + stride_x2]; + cur_col[j * 4 + 15] = l1[j + stride_x3]; + cur_col[j * 4 + 24] = l2[j]; + cur_col[j * 4 + 25] = l2[j + stride_x]; + cur_col[j * 4 + 26] = l2[j + stride_x2]; + cur_col[j * 4 + 27] = l2[j + stride_x3]; + } + cur_col += 36; + l0 += input_xy; + l1 += input_xy; + l2 += input_xy; + } + } + else + { + int cnt_y[4] = {imy0, (col_i + 1) / output_x, (col_i + 2) / output_x, imy3}; + int cnt_x[4] = {imx0, col_i - cnt_y[1] * output_x + 1, col_i - cnt_y[2] * output_x + 2, imx3}; + int imx_start[4] = {cnt_x[0] * stride_x - pad_x, cnt_x[1] * stride_x - pad_x, + cnt_x[2] * stride_x - pad_x, cnt_x[3] * stride_x - pad_x}; + int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, + cnt_y[2] * stride_y - pad_y, cnt_y[3] * stride_y - pad_y}; + for(kch = 
0; kch < input_chan; kch++) + for(ky = 0; ky < 3; ky++) + for(kx = 0; kx < 3; kx++) + { + int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; + int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; + for(i = 0; i < 4; i++) + { + if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); + else + *cur_col++ = 0.0; + } + } + } + } + // final 4 input + if(col_end & 0x3) + { + int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; + int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, + col_i - cnt_y[2] * output_x + 2, col_i - cnt_y[3] * output_x + 3}; + int imx_start[4] = {cnt_x[0] * stride_x - pad_x, cnt_x[1] * stride_x - pad_x, cnt_x[2] * stride_x - pad_x, + cnt_x[3] * stride_x - pad_x}; + int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, + cnt_y[3] * stride_y - pad_y}; + for(kch = 0; kch < input_chan; kch++) + for(ky = 0; ky < 3; ky++) + for(kx = 0; kx < 3; kx++) + { + int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; + int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; + for(i = 0; i < 4; i++) + { + if((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && + imy[i] < input_y) + *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); + else + *cur_col++ = 0.0; + } + } + } + } + else + { // for general cases + for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + { + int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; + int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, + col_i - cnt_y[2] * output_x + 2, col_i - cnt_y[3] * output_x + 3}; + int imx_start[4] = {cnt_x[0] * stride_x - 
pad_x, cnt_x[1] * stride_x - pad_x, cnt_x[2] * stride_x - pad_x, + cnt_x[3] * stride_x - pad_x}; + int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, + cnt_y[3] * stride_y - pad_y}; + for(kch = 0; kch < input_chan; kch++) + for(ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y) + for(kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x) + { + int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; + int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; + for(i = 0; i < 4; i++) + { + if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); + else + *cur_col++ = 0.0; + } + } + } + // final 4 input + if(col_end & 0x3) + { + int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; + int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, + col_i - cnt_y[2] * output_x + 2, col_i - cnt_y[3] * output_x + 3}; + int imx_start[4] = {cnt_x[0] * stride_x - pad_x, cnt_x[1] * stride_x - pad_x, cnt_x[2] * stride_x - pad_x, + cnt_x[3] * stride_x - pad_x}; + int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, + cnt_y[3] * stride_y - pad_y}; + for(kch = 0; kch < input_chan; kch++) + for(ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y) + for(kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x) + { + int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; + int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; + for(i = 0; i < 4; i++) + { + if((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && + imy[i] < input_y) + *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); + else + *cur_col++ = 0.0; + } + } + } + } +} + +// interleave 0 ~ 
(output_chan & -16) kernels with 16 in form of k[0-15][0],k[0-15][1],k[0-15][2].. +// interleave (output_chan & -16) ~ ((output_chan + 3) & -4) tail kernls with 4 in form of +// k[0-3][0],k[0-3][1],k[0-3][2].. +void interleave_kernel(float* kernel, float* kernel_interleaved, int kernel_chan, int kernel_size) +{ + int i, j; + float *cur_kernel0, *cur_kernel1, *cur_kernel2, *cur_kernel3, *cur_kernel4, *cur_kernel5, *cur_kernel6, + *cur_kernel7; + float *cur_kernel8, *cur_kernel9, *cur_kernel10, *cur_kernel11, *cur_kernel12, *cur_kernel13, *cur_kernel14, + *cur_kernel15; + float* cur_kernel_interleaved = kernel_interleaved; + + // interleave 16 kernels + for(i = 0; i < (kernel_chan & -16); i += 16) + { + cur_kernel0 = kernel + kernel_size * i; + cur_kernel1 = kernel + kernel_size * (i + 1); + cur_kernel2 = kernel + kernel_size * (i + 2); + cur_kernel3 = kernel + kernel_size * (i + 3); + cur_kernel4 = kernel + kernel_size * (i + 4); + cur_kernel5 = kernel + kernel_size * (i + 5); + cur_kernel6 = kernel + kernel_size * (i + 6); + cur_kernel7 = kernel + kernel_size * (i + 7); + cur_kernel8 = kernel + kernel_size * (i + 8); + cur_kernel9 = kernel + kernel_size * (i + 9); + cur_kernel10 = kernel + kernel_size * (i + 10); + cur_kernel11 = kernel + kernel_size * (i + 11); + cur_kernel12 = kernel + kernel_size * (i + 12); + cur_kernel13 = kernel + kernel_size * (i + 13); + cur_kernel14 = kernel + kernel_size * (i + 14); + cur_kernel15 = kernel + kernel_size * (i + 15); + for(j = 0; j < kernel_size; j++) + { + *(cur_kernel_interleaved++) = cur_kernel0[j]; + *(cur_kernel_interleaved++) = cur_kernel1[j]; + *(cur_kernel_interleaved++) = cur_kernel2[j]; + *(cur_kernel_interleaved++) = cur_kernel3[j]; + *(cur_kernel_interleaved++) = cur_kernel4[j]; + *(cur_kernel_interleaved++) = cur_kernel5[j]; + *(cur_kernel_interleaved++) = cur_kernel6[j]; + *(cur_kernel_interleaved++) = cur_kernel7[j]; + *(cur_kernel_interleaved++) = cur_kernel8[j]; + *(cur_kernel_interleaved++) = 
cur_kernel9[j]; + *(cur_kernel_interleaved++) = cur_kernel10[j]; + *(cur_kernel_interleaved++) = cur_kernel11[j]; + *(cur_kernel_interleaved++) = cur_kernel12[j]; + *(cur_kernel_interleaved++) = cur_kernel13[j]; + *(cur_kernel_interleaved++) = cur_kernel14[j]; + *(cur_kernel_interleaved++) = cur_kernel15[j]; + } + } + + for(i = (kernel_chan & -16); i < (kernel_chan & -4); i += 4) + { + cur_kernel0 = kernel + kernel_size * i; + cur_kernel1 = kernel + kernel_size * (i + 1); + cur_kernel2 = kernel + kernel_size * (i + 2); + cur_kernel3 = kernel + kernel_size * (i + 3); + for(j = 0; j < kernel_size; j++) + { + *(cur_kernel_interleaved++) = cur_kernel0[j]; + *(cur_kernel_interleaved++) = cur_kernel1[j]; + *(cur_kernel_interleaved++) = cur_kernel2[j]; + *(cur_kernel_interleaved++) = cur_kernel3[j]; + } + } + // last 4 kernel + cur_kernel0 = kernel + kernel_size * i; + cur_kernel1 = kernel + kernel_size * (i + 1); + cur_kernel2 = kernel + kernel_size * (i + 2); + if((kernel_chan & 0x3) == 3) + { + for(j = 0; j < kernel_size; j++) + { + *(cur_kernel_interleaved++) = cur_kernel0[j]; + *(cur_kernel_interleaved++) = cur_kernel1[j]; + *(cur_kernel_interleaved++) = cur_kernel2[j]; + *(cur_kernel_interleaved++) = 0.0; + } + } + else if((kernel_chan & 0x3) == 2) + { + for(j = 0; j < kernel_size; j++) + { + *(cur_kernel_interleaved++) = cur_kernel0[j]; + *(cur_kernel_interleaved++) = cur_kernel1[j]; + *(cur_kernel_interleaved++) = 0.0; + *(cur_kernel_interleaved++) = 0.0; + } + } + else if((kernel_chan & 0x3) == 1) + { + for(j = 0; j < kernel_size; j++) + { + *(cur_kernel_interleaved++) = cur_kernel0[j]; + *(cur_kernel_interleaved++) = 0.0; + *(cur_kernel_interleaved++) = 0.0; + *(cur_kernel_interleaved++) = 0.0; + } + } + + return; +} + +static void sgemm4x16(float* col, float* kernel, float* biases, bool bias_term, float* output, int kernel_size, + int col_start, int col_end, int kernel_start, int kernel_end, int output_xy, int activation, + int cpu_type) +{ + float initial[64], 
result[64]; + int col_line, kernel_num; + int i, j; + float *cur_col, *cur_kernel; + + for(kernel_num = (kernel_start & -16); kernel_num < (kernel_end & -16); kernel_num += 16) + { + if(bias_term) + for(i = 0; i < 64; i++) + initial[i] = *(biases + kernel_num + (i >> 2)); + cur_kernel = ( float* )(kernel + kernel_num * kernel_size); + + for(col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) + { + cur_col = ( float* )(col + col_line * kernel_size); + if(activation >= 0) + sgemm_4x16_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + else + sgemm_4x16_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + + if(activation > 0) + { + for(i = 0; i < 16; i++) + { + *(output + (kernel_num + i) * output_xy + col_line) = + std::min(result[(i << 2)], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 1) = + std::min(result[(i << 2) + 1], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 2) = + std::min(result[(i << 2) + 2], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 3) = + std::min(result[(i << 2) + 3], ( float )activation); + } + } + else + { + for(i = 0; i < 16; i++) + { + *(output + (kernel_num + i) * output_xy + col_line) = result[(i << 2)]; + *(output + (kernel_num + i) * output_xy + col_line + 1) = result[(i << 2) + 1]; + *(output + (kernel_num + i) * output_xy + col_line + 2) = result[(i << 2) + 2]; + *(output + (kernel_num + i) * output_xy + col_line + 3) = result[(i << 2) + 3]; + } + } + } + if(col_end & 0x3) + { + cur_col = ( float* )(col + col_line * kernel_size); + + if(activation >= 0) + sgemm_4x16_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + else + sgemm_4x16_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + + for(i = 0; i < 16; i++) + for(j = 0; j < (col_end & 0x3); j++) + { + if(activation > 0) + *(output + (kernel_num + i) * 
output_xy + col_line + j) = + std::min(result[(i << 2) + j], ( float )activation); + else + *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; + } + } + } +} + +static void sgemm4x4(float* col, float* kernel, float* biases, bool bias_term, float* output, int kernel_size, + int col_start, int col_end, int kernel_start, int kernel_end, int output_xy, int activation, + int cpu_type) +{ + float initial[16], result[16]; + int col_line, kernel_num; + int i, j; + float *cur_col, *cur_kernel; + + for(kernel_num = kernel_start & -4; kernel_num < (kernel_end & -4); kernel_num += 4) + { + if(bias_term) + for(i = 0; i < 16; i++) + initial[i] = *(biases + kernel_num + (i >> 2)); + cur_kernel = ( float* )(kernel + kernel_num * kernel_size); + for(col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) + { + cur_col = ( float* )(col + col_line * kernel_size); + + if(activation >= 0) + sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + else + sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + + if(activation > 0) + { + for(i = 0; i < 4; i++) + { + *(output + (kernel_num + i) * output_xy + col_line) = + std::min(result[(i << 2) + 0], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 1) = + std::min(result[(i << 2) + 1], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 2) = + std::min(result[(i << 2) + 2], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 3) = + std::min(result[(i << 2) + 3], ( float )activation); + } + } + else + { + for(i = 0; i < 4; i++) + { + *(output + (kernel_num + i) * output_xy + col_line) = result[(i << 2) + 0]; + *(output + (kernel_num + i) * output_xy + col_line + 1) = result[(i << 2) + 1]; + *(output + (kernel_num + i) * output_xy + col_line + 2) = result[(i << 2) + 2]; + *(output + (kernel_num + i) * output_xy + col_line + 3) = result[(i << 2) + 3]; 
+ } + } + } + if(col_end & 0x3) + { + cur_col = ( float* )(col + col_line * kernel_size); + if(activation >= 0) + sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + else + sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + for(i = 0; i < 4; i++) + { + for(j = 0; j < (col_end & 0x3); j++) + { + if(activation > 0) + *(output + (kernel_num + i) * output_xy + col_line + j) = + std::min(result[(i << 2) + j], ( float )activation); + else + *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; + } + } + } + } + + if(kernel_end & 0x3) + { + if(bias_term) + for(i = 0; i < ((kernel_end & 0x3) << 2); i++) + initial[i] = *(biases + kernel_num + (i >> 2)); + cur_kernel = ( float* )(kernel + kernel_num * kernel_size); + for(col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) + { + cur_col = ( float* )(col + col_line * kernel_size); + + if(activation >= 0) + sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + else + sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + + if(activation > 0) + { + for(i = 0; i < (kernel_end & 0x3); i++) + { + *(output + (kernel_num + i) * output_xy + col_line) = + std::min(result[(i << 2) + 0], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 1) = + std::min(result[(i << 2) + 1], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 2) = + std::min(result[(i << 2) + 2], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 3) = + std::min(result[(i << 2) + 3], ( float )activation); + } + } + else + { + for(i = 0; i < (kernel_end & 0x3); i++) + { + *(output + (kernel_num + i) * output_xy + col_line) = result[(i << 2) + 0]; + *(output + (kernel_num + i) * output_xy + col_line + 1) = result[(i << 2) + 1]; + *(output + (kernel_num + i) * output_xy + col_line + 2) = result[(i << 2) 
+ 2]; + *(output + (kernel_num + i) * output_xy + col_line + 3) = result[(i << 2) + 3]; + } + } + } + if(col_end & 0x3) + { + cur_col = ( float* )(col + col_line * kernel_size); + if(activation >= 0) + sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + else + sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + + for(i = 0; i < (kernel_end & 0x3); i++) + { + for(j = 0; j < (col_end & 0x3); j++) + { + if(activation > 0) + *(output + (kernel_num + i) * output_xy + col_line + j) = + std::min(result[(i << 2) + j], ( float )activation); + else + *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; + } + } + } + } +} + +struct im2col_param +{ + float* im; + float* col; + int input_chan; + int input_x; + int input_y; + int kernel_x; + int kernel_y; + int stride_x; + int stride_y; + int dilation_x; + int dilation_y; + int pad_x0; + int pad_x1; + int pad_y0; + int pad_y1; + int output_x; + int output_y; + int col_start; + int col_end; +}; + +struct sgemm_param +{ + float* col; + float* kernel; + float* biases; + bool bias_term; + float* output; + int kernel_size; + int col_start; + int col_end; + int kernel_start; + int kernel_end; + int output_xy; +}; + +struct conv1x1s1_param +{ + const float* input; + float* output; + const float* kernel; + const float* bias; + int in_h; + int in_w; + int in_ch; + int out_h; + int out_w; + int out_ch; + bool relu_fused; +}; + +struct ConvFast : public MTNodeOps +{ + bool Prerun(Node* node) override; + bool Reshape(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + bool GetSharedMemorySize(Node*, unsigned int& mem_size) override; + bool SetSharedMemoryAddr(Node*, void* mem_addr, int mem_size) override; + + bool float_mode; + bool im2col_aider(int cpu, int seq, void* data /* im2col_param * param */); + bool sgemm_aider(int cpu, int seq, void* data /* sgemm_param * param */); + bool 
sgemm4x4_aider(int cpu, int seq, void* data /* sgemm_param * param */); + + int activation; + bool dynamic_shape; +}; + +bool ConvFast::im2col_aider(int cpu, int seq, void* data) +{ + im2col_param* param = ( im2col_param* )(data); + im2col(param->im, param->col, param->input_chan, param->input_x, param->input_y, param->kernel_x, param->kernel_y, + param->stride_x, param->stride_y, param->dilation_x, param->dilation_y, param->pad_x0, param->pad_x1, + param->pad_y0, param->pad_y1, param->output_x, param->output_y, param->col_start, param->col_end); + + return true; +} + +bool ConvFast::sgemm4x4_aider(int cpu, int seq, void* data) +{ + int cpu_type = TYPE_A72; + sgemm_param* param = ( sgemm_param* )(data); + + sgemm4x4(param->col, param->kernel, param->biases, param->bias_term, param->output, param->kernel_size, + param->col_start, param->col_end, param->kernel_start, param->kernel_end, param->output_xy, activation, + cpu_type); + + return true; +} + +bool ConvFast::sgemm_aider(int cpu, int seq, void* data) +{ + int cpu_type = TYPE_A72; + sgemm_param* param = ( sgemm_param* )(data); + + sgemm4x16(param->col, param->kernel, param->biases, param->bias_term, param->output, param->kernel_size, + param->col_start, param->col_end, param->kernel_start, param->kernel_end, param->output_xy, activation, + cpu_type); + + return true; +} + +bool ConvFast::Prerun(Node* node) +{ + Convolution* conv_op = dynamic_cast(node->GetOp()); + ConvParam* param = conv_op->GetParam(); + int group = param->group; + + Tensor* output_tensor = node->GetOutputTensor(0); + TShape& output_shape = output_tensor->GetShape(); + int output_chan = output_shape.GetC() / group; + + /* pre-allocate col_buf */ + Tensor* input_tensor = node->GetInputTensor(0); + TShape& input_shape = input_tensor->GetShape(); + + int input_chan = input_shape.GetC() / group; + int kernel_size = input_chan * param->kernel_h * param->kernel_w; + + if(!dynamic_shape) + { + if(node->ExistAttr("shared_col_buf")) + { + float* addr = 
( float* )any_cast(node->GetAttr("shared_col_buf")); + + (*node)["col_buf"] = addr; + } + else + { + unsigned int col_size; + + GetSharedMemorySize(node, col_size); + + float* col_buf = ( float* )mem_alloc(col_size); + (*node)["col_buf"] = col_buf; + node->SetAttr("col_buf_allocated", col_size); + } + } + + /* packing kernel data */ + Tensor* kernel_tensor = node->GetInputTensor(1); + + float* kernel_interleaved = NULL; + + int kernel_interleaved_size_g = kernel_size * ((output_chan + 3) & -4); + int kernel_size_g = kernel_size * output_chan; + float* kernel_org = ( float* )get_tensor_mem(kernel_tensor); + kernel_interleaved = ( float* )mem_alloc(sizeof(float) * (kernel_interleaved_size_g * group) + 128); + + for(int g = 0; g < group; ++g) + { + float* kernel = kernel_org + g * kernel_size_g; + float* kernel_interleaved_g = kernel_interleaved + g * kernel_interleaved_size_g; + interleave_kernel(kernel, kernel_interleaved_g, output_chan, kernel_size); + } + + (*node)["kernel_interleaved"] = kernel_interleaved; + + if(exec_attr->low_mem_mode) + { + kernel_tensor->FreeMem(); + } + + return true; +} + +bool ConvFast::Reshape(Node* node) +{ + unsigned int new_col_size; + + GetSharedMemorySize(node, new_col_size); + + if(node->ExistAttr("col_buf_allocated")) + { + unsigned int col_size = any_cast(node->GetAttr("col_buf_allocated")); + if(new_col_size == col_size) + return true; + + float* addr = any_cast(node->GetAttr("col_buf")); + mem_free(addr); + } + + float* col_buf = ( float* )mem_alloc(new_col_size); + (*node)["col_buf"] = col_buf; + + node->SetAttr("col_buf_allocated", new_col_size); + return true; +} + +bool ConvFast::Run(Node* node) +{ + /* input */ + Tensor* input_tensor = node->GetInputTensor(0); + + Convolution* conv_op = dynamic_cast(node->GetOp()); + ConvParam* param = conv_op->GetParam(); + + const TShape& input_shape = input_tensor->GetShape(); + + int group = param->group; + int input_chan = input_shape.GetC() / group; + int input_h = 
input_shape.GetH(); + int input_w = input_shape.GetW(); + int input_size = input_w * input_h * input_chan; + int pad_x0 = param->pad_w0; // left padding columns + int pad_x1 = param->pad_w1; // right padding columns + int pad_y0 = param->pad_h0; // top padding rows + int pad_y1 = param->pad_h1; // bottom padding rows + int stride_x = param->stride_w; + int stride_y = param->stride_h; + int dilation_x = param->dilation_w; + int dilation_y = param->dilation_h; + float* input_org = ( float* )get_tensor_mem(input_tensor); + float* col = any_cast(node->GetAttr("col_buf")); + + /* output */ + Tensor* output_tensor = node->GetOutputTensor(0); + TShape& output_shape = output_tensor->GetShape(); + float* output_org = ( float* )get_tensor_mem(output_tensor); + int output_y = output_shape.GetH(); + int output_x = output_shape.GetW(); + int output_xy = output_x * output_y; + int output_chan = output_shape.GetC() / group; + int output_n = output_shape.GetN(); + + /* kernel */ + int kernel_x = param->kernel_w; + int kernel_y = param->kernel_h; + int kernel_size = input_chan * kernel_x * kernel_y; + + float* kernel_interleaved = any_cast(node->GetAttr("kernel_interleaved")); + + int cpu_number = cpu_info->GetCPUNumber(); + + /* biases */ + + float* biases = NULL; + bool have_biases = (node->GetInputNum() > 2); + + if(have_biases) + { + biases = ( float* )get_tensor_mem(node->GetInputTensor(2)); + } + + int cpu_type; + + if(cpu_info->GetCPUModel(cpu_info->GetMasterCPU()) == CPU_A72) + cpu_type = TYPE_A72; + else + cpu_type = TYPE_A53; + + /* block size split parameter */ + int L2_CACHE_SIZE = (cpu_type == TYPE_A53) ? 512 * 1024 : 1024 * 1024; + int kernel_size_l1 = kernel_size; + int col_cnt_l2 = L2_CACHE_SIZE / 4 / kernel_size_l1 * 7 / 8; + col_cnt_l2 = col_cnt_l2 > 4 ? 
(col_cnt_l2 & -4) : 4; + + /* one image per time */ + for(int i = 0; i < output_n; i++) + { + float* input = input_org + i * input_size * group; + float* output = output_org + i * output_xy * output_chan * group; + + for(int g = 0; g < group; g++) + { + float* input_g = input + g * input_size; + int total_num = output_xy * input_chan * kernel_x * kernel_y; + + if(cpu_number == 1 || total_num < 100 * 1000) + im2col(input_g, col, input_chan, input_w, input_h, kernel_x, kernel_y, stride_x, stride_y, dilation_x, + dilation_y, pad_x0, pad_x1, pad_y0, pad_y1, output_x, output_y, 0, output_xy); + else + { + std::vector task_list; + std::vector param_list; + + auto f = std::bind(&ConvFast::im2col_aider, this, std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + + int steps = output_xy / cpu_number; + + steps = (steps + 3) & (~0x3); + + int offset; + int real_cpu_number = cpu_number; + + while(1) + { + offset = steps * real_cpu_number - output_xy; + + if(offset < steps) + break; + + real_cpu_number--; + } + + task_list.resize(real_cpu_number); + param_list.resize(real_cpu_number); + + for(int i = 0; i < real_cpu_number; i++) + { + im2col_param* param = ¶m_list[i]; + sub_op_task* task = &task_list[i]; + + task->exec_func = f; + task->seq = i; + task->data = param; + + param->im = input_g; + param->col = col; + param->input_chan = input_chan; + param->input_x = input_w; + param->input_y = input_h; + param->kernel_x = kernel_x; + param->kernel_y = kernel_y; + param->stride_x = stride_x; + param->stride_y = stride_y; + param->dilation_x = dilation_x; + param->dilation_y = dilation_y; + param->pad_x0 = pad_x0; + param->pad_x1 = pad_x1; + param->pad_y0 = pad_y0; + param->pad_y1 = pad_y1; + param->output_x = output_x; + param->output_y = output_y; + param->col_start = i * steps; + param->col_end = param->col_start + steps; + } + + param_list[real_cpu_number - 1].col_end = output_xy; + + task_dispatch(task_list, -1); + wait_done(); + } + + float* kernel_g = 
kernel_interleaved + g * (kernel_size * ((output_chan + 3) & -4)); + float* output_g = output + g * output_xy * output_chan; + float* bias_g = biases + g * output_chan; + + std::vector task_list; + std::vector param_list; + + int chan_16_num = output_chan / 16; + int chan_4_num = (output_chan & 0xf) ? 1 : 0; + int l2_loop = (output_xy - 1) / col_cnt_l2 + 1; + int max_task_num = l2_loop * (chan_16_num + chan_4_num); + + if(cpu_number > 1) + param_list.resize(max_task_num); + + // for input block of L2 cache size + for(int col_i = 0; col_i < output_xy; col_i += col_cnt_l2) + { + int col_start = col_i; + int col_end = col_i + col_cnt_l2; + col_end = col_end > output_xy ? output_xy : col_end; + + if(cpu_number == 1) + { + sgemm4x16(col, kernel_g, bias_g, have_biases, output_g, kernel_size, col_start, col_end, 0, + output_chan & -16, output_xy, activation, cpu_type); + if(output_chan & 0xf) + sgemm4x4(col, kernel_g, bias_g, have_biases, output_g, kernel_size, col_start, col_end, + output_chan & -16, output_chan, output_xy, activation, cpu_type); + } + else + { + auto f = std::bind(&ConvFast::sgemm_aider, this, std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + + for(int i = 0; i < chan_16_num; i++) + { + sub_op_task tmp_task; + sgemm_param* param = ¶m_list[task_list.size()]; + sub_op_task* task = &tmp_task; + task->exec_func = f; + task->seq = i; + task->data = param; + + param->col = col; + param->kernel = kernel_g; + param->biases = bias_g; + param->bias_term = have_biases; + param->output = output_g; + param->kernel_size = kernel_size; + param->col_start = col_start; + param->col_end = col_end; + param->kernel_start = i * 16; + param->kernel_end = param->kernel_start + 16; + param->output_xy = output_xy; + + task_list.emplace_back(tmp_task); + } + + if(output_chan & 0xf) + { + auto f = std::bind(&ConvFast::sgemm4x4_aider, this, std::placeholders::_1, + std::placeholders::_2, std::placeholders::_3); + sub_op_task tmp_task; + sgemm_param* param = 
¶m_list[task_list.size()]; + sub_op_task* task = &tmp_task; + task->exec_func = f; + task->seq = task_list.size() - 1; + task->data = param; + + param->col = col; + param->kernel = kernel_g; + param->biases = bias_g; + param->bias_term = have_biases; + param->output = output_g; + param->kernel_size = kernel_size; + param->col_start = col_start; + param->col_end = col_end; + param->kernel_start = output_chan & -16; + param->kernel_end = output_chan; + param->output_xy = output_xy; + + task_list.emplace_back(tmp_task); + } + } + } + + if(cpu_number > 1) + { + task_dispatch(task_list, -1); + wait_done(); + } + } + } + + return true; +} + +bool ConvFast::Postrun(Node* node) +{ + if(node->ExistAttr("kernel_interleaved")) + { + float* addr; + addr = any_cast(node->GetAttr("kernel_interleaved")); + + mem_free(addr); + + node->RemoveAttr("kernel_interleaved"); + } + + if(node->ExistAttr("col_buf_allocated")) + { + float* addr = any_cast(node->GetAttr("col_buf")); + mem_free(addr); + + node->RemoveAttr("col_buf_allocated"); + } + + if(node->ExistAttr("col_buf")) + node->RemoveAttr("col_buf"); + + return true; +} + +bool ConvFast::GetSharedMemorySize(Node* node, unsigned int& mem_size) +{ + Convolution* conv_op = dynamic_cast(node->GetOp()); + ConvParam* param = conv_op->GetParam(); + int group = param->group; + + Tensor* output_tensor = node->GetOutputTensor(0); + TShape& output_shape = output_tensor->GetShape(); + int output_y = output_shape.GetH(); + int output_x = output_shape.GetW(); + + Tensor* input_tensor = node->GetInputTensor(0); + TShape& input_shape = input_tensor->GetShape(); + + int input_chan = input_shape.GetC() / group; + int kernel_size = input_chan * param->kernel_h * param->kernel_w; + int output_xy = output_x * output_y; + + mem_size = (sizeof(float) * (kernel_size * ((output_xy + 3) & -4)) + 128); + + return true; +} + +bool ConvFast::SetSharedMemoryAddr(Node* node, void* mem_addr, int mem_size) +{ + (*node)["shared_col_buf"] = mem_addr; + return true; 
+} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + + if(exec_attr->graph_layout == TENGINE_LAYOUT_NHWC) + return nullptr; + + ConvFast* ops = new ConvFast(); + + ops->need_free = true; + + if(node->IsDynamicShape()) + ops->dynamic_shape = true; + else + ops->dynamic_shape = false; + + Convolution* conv_op = dynamic_cast(node->GetOp()); + ConvParam* param = conv_op->GetParam(); + + ops->activation = param->activation; + + return ops; +} + +} // conv_fast + +void RegisterConv2dFast(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor("arm64", "Convolution", conv_fast::SelectFunc, + conv_fast::default_prio); +} + +} // namespace TEngine diff --git a/executor/operator/arm64/conv/dw_k3s1p1.S b/executor/operator/arm64/conv/dw_k3s1p1.S index 36a0d3563..ca3e34223 100644 --- a/executor/operator/arm64/conv/dw_k3s1p1.S +++ b/executor/operator/arm64/conv/dw_k3s1p1.S @@ -1,736 +1,736 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haitao@openailab.com - */ - - -//x0: input -//x1: h -//x2: w -//x3: kernel -//x4: output //L-2 -//x5 : bias -//x10: L-1 output -//x6: L0 output -//x7: processed item -//x8: counter -//x9: x2*4 - -//v0-v3: L-2 -//v4-v7: L-1 -//v8-v11: L0 -//v12-v15/v16-v20: input two group -//v24-v26: kernel -//v27 --- saved previous vector -// v28,v29 --- shifted - -//v30 : bias -#ifndef KERNEL_NAME -#define KERNEL_NAME dw_k3s1p1 -#endif - -.text -.align 5 -.global KERNEL_NAME -.type KERNEL_NAME, %function - - -KERNEL_NAME: - - //Load Kernel - ld1 {v24.4s,v25.4s,v26.4s}, [x3] - - ext v26.16b,v25.16b,v26.16b,8 - ext v25.16b,v24.16b,v25.16b,12 - - lsl x9,x2,2 - fmov s31,wzr - dup v31.4s,v31.s[0] - - cbz x5 ,non_biases - //get the bias - ldr s30, [x5] - dup v30.4s,v30.s[0] - - b first_row_start -non_biases: - fmov s30, wzr - dup v30.4s,v30.s[0] -first_row_start: - sub x1,x1,1 - sub x7,x2,1 //save last item in row - lsr x8,x7,2 - lsl x7,x8,2 - - ins v27.s[3],v31.s[0] //pre_vector for input - - cbz x1,single_line - mov x10,x4 //L-1 - add x6,x10,x9 //L-0 - - - cbz x8,first_last_4 - - //output - - -first_row_loop: - //load 4 float input - ld1 {v12.4s},[x0],#16 - ld1r {v13.4s},[x0] - - ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - ins v27.s[3],v12.s[3] //save prev vector - - //L-1: k1 xinput - fmul v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v4.4s,v29.4s,v25.s[2] //k12 - - st1 {v4.4s},[x10],#16 - - //L0 - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla v8.4s,v12.4s,v24.s[1] //k01 - fmla v8.4s,v29.4s,v24.s[2] //k02 - - st1 {v8.4s},[x6],#16 - - //next loop - subs x8,x8,1 - b.ne first_row_loop - -first_last_4: - //left ones: 1-4 - sub x8,x2,x7 - cmp x8,4 - blt first_less_4 - - //4 nodes - ld1 {v12.4s},[x0],#16 - ins v13.s[0],v31.s[0] - - ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - 
//L-1: k1 xinput - fmul v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v4.4s,v29.4s,v25.s[2] //k12 - - st1 {v4.4s},[x10],#16 - - //L0 - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla v8.4s,v12.4s,v24.s[1] //k01 - fmla v8.4s,v29.4s,v24.s[2] //k02 - - st1 {v8.4s},[x6],#16 - - b first_row_done - -first_less_4: - cmp x8,1 - bge first_1_2_3 - b first_row_done - -first_1_2_3: - dup v12.4s,v31.s[0] - dup v13.4s,v31.s[0] - - //2 or 3 items - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - sub x7,x8,1 - cbz x7, first_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - sub x7,x8,2 - - cbz x7, first_left_load_done - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - -first_left_load_done: - - ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - //L-1 - fmul v4.4s,v28.4s,v25.s[0] //k10, - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v8.4s,v12.4s,v24.s[1] //k01 - fmla v4.4s,v29.4s,v25.s[2] //k12 - fmla v8.4s,v29.4s,v24.s[2] //k02 - - //save result: 2 or 3 - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - ins v28.s[0],v8.s[0] - str s28,[x6],#4 - - cmp x8, 2 - blt first_row_done - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - - - ins v28.s[0],v8.s[1] - str s28,[x6],#4 - - cmp x8,3 - blt first_row_done - - ins v28.s[0],v4.s[2] - str s28,[x10] - - ins v28.s[0],v8.s[2] - str s28,[x6] - -first_row_done: - -mid_row_start: - - sub x1,x1,1 - cbz x1, last_row_start - - sub x7,x2,1 //save one - lsr x8,x7,2 - lsl x7,x8,2 - - add x10,x4,x9 //L-1 - add x6,x10,x9 //L0 - dup v27.4s,v31.s[0] - - cbz x8,mid_last_4 - -mid_loop_start: - - ld1 {v0.4s},[x4] - ld1 {v4.4s},[x10] - //ld1 {v8.4s},[x6],#16 //L0 is always zero - - ld1 {v12.4s},[x0],#16 - ld1r {v13.4s},[x0] - - ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 - //v12: a00, a01, a02 ,a03 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmul 
v8.4s,v28.4s,v24.s[0] //k00 - fmla v0.4s,v12.4s,v26.s[1] //k21, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v8.4s,v12.4s,v24.s[1] //k01 - fmla v0.4s,v29.4s,v26.s[2] //k22 - fmla v4.4s,v29.4s,v25.s[2] //k12 - fmla v8.4s,v29.4s,v24.s[2] //k02 -//add bias - fadd v0.4s,v0.4s,v30.4s -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - st1 {v0.4s},[x4],#16 - - //L-1 - st1 {v4.4s},[x10],#16 - - - //L0 - st1 {v8.4s},[x6],#16 - - ins v27.s[3],v12.s[3] - - //next loop - subs x8,x8,1 - b.ne mid_loop_start - -mid_last_4: - sub x8,x2,x7 - cmp x8,4 - blt mid_less_4 - - ld1 {v0.4s},[x4] - ld1 {v4.4s},[x10] - - ld1 {v12.4s},[x0],#16 - ins v13.s[0],v31.s[0] - - ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 - //v12: a00, a01, a02 ,a03 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla v0.4s,v12.4s,v26.s[1] //k21, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v8.4s,v12.4s,v24.s[1] //k01 - fmla v0.4s,v29.4s,v26.s[2] //k22 - fmla v4.4s,v29.4s,v25.s[2] //k12 - fmla v8.4s,v29.4s,v24.s[2] //k02 -//add bias - fadd v0.4s,v0.4s,v30.4s -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - st1 {v0.4s},[x4],#16 - - - //L-1 - st1 {v4.4s},[x10],#16 - - //L0 - st1 {v8.4s},[x6],#16 - - b mid_row_start - -mid_less_4: - cmp x8,1 - blt mid_row_start - -mid_left_1_2_3: - - dup v12.4s,v31.s[0] - dup v13.4s,v31.s[0] - dup v0.4s,v31.s[0] - dup v4.4s,v31.s[0] - - - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - - ldr s28,[x4] - ins v0.s[0],v28.s[0] - ldr s28,[x10] - ins v4.s[0],v28.s[0] - - - cmp x8,2 - blt mid_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - - ldr s28,[x4,#4] - ins v0.s[1],v28.s[0] - ldr s28,[x10, #4] - ins v4.s[1],v28.s[0] - - cmp x8,3 - blt mid_left_load_done - - - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - - ldr s28,[x4,#8] - ins v0.s[2],v28.s[0] - ldr s28,[x10, #8] - ins v4.s[2],v28.s[0] - -mid_left_load_done: - - ext 
v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla v0.4s,v12.4s,v26.s[1] //k21, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v8.4s,v12.4s,v24.s[1] //k01 - - fmla v0.4s,v29.4s,v26.s[2] //k22 - fmla v4.4s,v29.4s,v25.s[2] //k12 - fmla v8.4s,v29.4s,v24.s[2] //k02 - -//add bias - fadd v0.4s,v0.4s,v30.4s - //save result:1, 2 or 3 - ins v28.s[0],v0.s[0] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - ins v28.s[0],v8.s[0] - str s28,[x6],#4 - - cmp x8,2 - blt mid_row_start - - ins v28.s[0],v0.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - - ins v28.s[0],v8.s[1] - str s28,[x6],#4 - - cmp x8,3 - blt mid_row_start - - ins v28.s[0],v0.s[2] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v4.s[2] - str s28,[x10] - - ins v28.s[0],v8.s[2] - str s28,[x6] - - b mid_row_start - - -last_row_start: - - - sub x7,x2,1 - lsr x8,x7,2 - lsl x7,x8,2 - - dup v27.4s,v31.s[0] - - add x10,x4,x9 //L-1 - - cbz x8,last_last_4 - -last_loop_start: - - ld1 {v0.4s},[x4] - ld1 {v4.4s},[x10] - - ld1 {v12.4s},[x0],#16 - ld1 {v13.4s},[x0] - - ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 - //v12: a00, a01, a02 ,a03 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v0.4s,v12.4s,v26.s[1] //k21, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v0.4s,v29.4s,v26.s[2] //k22 - fmla v4.4s,v29.4s,v25.s[2] //k12 -//add bias - fadd v0.4s,v0.4s,v30.4s - -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - st1 {v0.4s},[x4],#16 - - //L-1 -//add bias - fadd v4.4s,v4.4s,v30.4s -#ifdef CONV_RELU_FUSE - fmax v4.4s,v4.4s,v31.4s -#endif - st1 {v4.4s},[x10],#16 - - ins 
v27.s[3],v12.s[3] - - //next loop - subs x8,x8,1 - b.ne last_loop_start - -last_last_4: - - sub x8,x2,x7 - cmp x8,4 - blt last_less_4 - - ld1 {v12.4s},[x0],#16 - dup v13.4s,v31.s[0] - - ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 - //v12: a00, a01, a02 ,a03 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - ld1 {v0.4s},[x4] - ld1 {v4.4s},[x10] - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v0.4s,v12.4s,v26.s[1] //k21, - fmla v0.4s,v29.4s,v26.s[2] //k22 -//add bias - fadd v0.4s,v0.4s,v30.4s - -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - st1 {v0.4s},[x4],#16 - - - //L-1 - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v4.4s,v29.4s,v25.s[2] //k12 -//add bias - fadd v4.4s,v4.4s,v30.4s -#ifdef CONV_RELU_FUSE - fmax v4.4s,v4.4s,v31.4s -#endif - st1 {v4.4s},[x10],#16 - - ins v27.s[3],v12.s[3] - - b last_row_done - -last_less_4: - - cmp x8,1 - blt last_row_done - -last_1_2_3: - - dup v12.4s,v31.s[0] - dup v13.4s,v31.s[0] - dup v0.4s,v31.s[0] - dup v4.4s,v31.s[0] - - - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - ldr s28,[x4] - ins v0.s[0],v28.s[0] - ldr s28,[x10] - ins v4.s[0],v28.s[0] - - sub x7,x8,1 - cbz x7, last_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - - ldr s28,[x4,#4] - ins v0.s[1],v28.s[0] - ldr s28,[x10,#4] - ins v4.s[1],v28.s[0] - - - sub x7,x8,2 - cbz x7, last_left_load_done - - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - - ldr s28,[x4,#8] - ins v0.s[2],v28.s[0] - ldr s28,[x10,#8] - ins v4.s[2],v28.s[0] - -last_left_load_done: - - ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v0.4s,v12.4s,v26.s[1] //k21, - fmla v0.4s,v29.4s,v26.s[2] //k22 - - //L-1 - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v4.4s,v29.4s,v25.s[2] //k12 - -//add bias - fadd v0.4s,v0.4s,v30.4s - //save result: 1 2 or 3 - ins v28.s[0],v0.s[0] -#ifdef CONV_RELU_FUSE - 
fmax s28,s28,s31 -#endif - str s28,[x4],#4 - -//add bias - fadd v4.4s,v4.4s,v30.4s - - ins v28.s[0],v4.s[0] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x10],#4 - - cmp x8,2 - blt last_row_done - - ins v28.s[0],v0.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v4.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x10],#4 - - cmp x8,3 - blt last_row_done - - ins v28.s[0],v0.s[2] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4] - - ins v28.s[0],v4.s[2] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x10] - - -last_row_done: - ret - -single_line: - mov x10,x4 - cbz x8,single_line_last_4 - -single_line_row_loop: - //load 4 input - ld1 {v12.4s},[x0],#16 - ld1r {v13.4s},[x0] - - ext v28.16b,v27.16b,v12.16b,12 //last_3, a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - ins v27.s[3],v12.s[3] - - dup v4.4s,v30.s[0] - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v4.4s,v29.4s,v25.s[2] //k12 -#ifdef CONV_RELU_FUSE - fmax v4.4s,v4.4s,v31.4s -#endif - - st1 {v4.4s},[x10],#16 - - //next loop - subs x8,x8,1 - b.ne single_line_row_loop - -single_line_last_4: - //x8=x2-x7, and x7<=x2-1 and x7=4N and N is non-negative number, so left ones: 1-4 - sub x8,x2,x7 - cmp x8,4 - blt single_line_less_4 - - ld1 {v12.4s},[x0],#16 - ins v13.s[0],v31.s[0] - - ext v28.16b,v27.16b,v12.16b,12 //last_3, a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - - dup v4.4s,v30.s[0] - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v4.4s,v29.4s,v25.s[2] //k12 -#ifdef CONV_RELU_FUSE - fmax v4.4s,v4.4s,v31.4s -#endif - - st1 {v4.4s},[x10],#16 - b single_line_done - -single_line_less_4: - cmp x8,1 - bge single_line_1_2_3 - b single_line_done - -single_line_1_2_3: - dup v12.4s,v31.s[0] - dup v13.4s,v31.s[0] - - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - sub x7,x8,1 - cbz x7,single_line_left_load_done - - ldr 
s28,[x0],#4 - ins v12.s[1],v28.s[0] - sub x7,x8,2 - - cbz x7,single_line_left_load_done - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - -single_line_left_load_done: - ext v28.16b,v27.16b,v12.16b,12 //last_3, a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - dup v4.4s,v30.s[0] - fmla v4.4s,v28.4s,v25.s[0] //k10 - fmla v4.4s,v12.4s,v25.s[1] //k11 - fmla v4.4s,v29.4s,v25.s[2] //k12 -#ifdef CONV_RELU_FUSE - fmax v4.4s,v4.4s,v31.4s -#endif - - //save result - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - cmp x8,2 - blt single_line_done - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - - cmp x8,3 - blt single_line_done - - ins v28.s[0],v4.s[2] - str s28,[x10] - -single_line_done: - ret +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ + + +//x0: input +//x1: h +//x2: w +//x3: kernel +//x4: output //L-2 +//x5 : bias +//x10: L-1 output +//x6: L0 output +//x7: processed item +//x8: counter +//x9: x2*4 + +//v0-v3: L-2 +//v4-v7: L-1 +//v8-v11: L0 +//v12-v15/v16-v20: input two group +//v24-v26: kernel +//v27 --- saved previous vector +// v28,v29 --- shifted + +//v30 : bias +#ifndef KERNEL_NAME +#define KERNEL_NAME dw_k3s1p1 +#endif + +.text +.align 5 +.global KERNEL_NAME +.type KERNEL_NAME, %function + + +KERNEL_NAME: + + //Load Kernel + ld1 {v24.4s,v25.4s,v26.4s}, [x3] + + ext v26.16b,v25.16b,v26.16b,8 + ext v25.16b,v24.16b,v25.16b,12 + + lsl x9,x2,2 + fmov s31,wzr + dup v31.4s,v31.s[0] + + cbz x5 ,non_biases + //get the bias + ldr s30, [x5] + dup v30.4s,v30.s[0] + + b first_row_start +non_biases: + fmov s30, wzr + dup v30.4s,v30.s[0] +first_row_start: + sub x1,x1,1 + sub x7,x2,1 //save last item in row + lsr x8,x7,2 + lsl x7,x8,2 + + ins v27.s[3],v31.s[0] //pre_vector for input + + cbz x1,single_line + mov x10,x4 //L-1 + add x6,x10,x9 //L-0 + + + cbz x8,first_last_4 + + //output + + +first_row_loop: + //load 4 float input + ld1 {v12.4s},[x0],#16 + ld1r {v13.4s},[x0] + + ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + ins v27.s[3],v12.s[3] //save prev vector + + //L-1: k1 xinput + fmul v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v4.4s,v29.4s,v25.s[2] //k12 + + st1 {v4.4s},[x10],#16 + + //L0 + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v8.4s,v12.4s,v24.s[1] //k01 + fmla v8.4s,v29.4s,v24.s[2] //k02 + + st1 {v8.4s},[x6],#16 + + //next loop + subs x8,x8,1 + b.ne first_row_loop + +first_last_4: + //left ones: 1-4 + sub x8,x2,x7 + cmp x8,4 + blt first_less_4 + + //4 nodes + ld1 {v12.4s},[x0],#16 + ins v13.s[0],v31.s[0] + + ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + 
//L-1: k1 xinput + fmul v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v4.4s,v29.4s,v25.s[2] //k12 + + st1 {v4.4s},[x10],#16 + + //L0 + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v8.4s,v12.4s,v24.s[1] //k01 + fmla v8.4s,v29.4s,v24.s[2] //k02 + + st1 {v8.4s},[x6],#16 + + b first_row_done + +first_less_4: + cmp x8,1 + bge first_1_2_3 + b first_row_done + +first_1_2_3: + dup v12.4s,v31.s[0] + dup v13.4s,v31.s[0] + + //2 or 3 items + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + sub x7,x8,1 + cbz x7, first_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + sub x7,x8,2 + + cbz x7, first_left_load_done + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + +first_left_load_done: + + ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + //L-1 + fmul v4.4s,v28.4s,v25.s[0] //k10, + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v8.4s,v12.4s,v24.s[1] //k01 + fmla v4.4s,v29.4s,v25.s[2] //k12 + fmla v8.4s,v29.4s,v24.s[2] //k02 + + //save result: 2 or 3 + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + ins v28.s[0],v8.s[0] + str s28,[x6],#4 + + cmp x8, 2 + blt first_row_done + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + + + ins v28.s[0],v8.s[1] + str s28,[x6],#4 + + cmp x8,3 + blt first_row_done + + ins v28.s[0],v4.s[2] + str s28,[x10] + + ins v28.s[0],v8.s[2] + str s28,[x6] + +first_row_done: + +mid_row_start: + + sub x1,x1,1 + cbz x1, last_row_start + + sub x7,x2,1 //save one + lsr x8,x7,2 + lsl x7,x8,2 + + add x10,x4,x9 //L-1 + add x6,x10,x9 //L0 + dup v27.4s,v31.s[0] + + cbz x8,mid_last_4 + +mid_loop_start: + + ld1 {v0.4s},[x4] + ld1 {v4.4s},[x10] + //ld1 {v8.4s},[x6],#16 //L0 is always zero + + ld1 {v12.4s},[x0],#16 + ld1r {v13.4s},[x0] + + ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 + //v12: a00, a01, a02 ,a03 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmul 
v8.4s,v28.4s,v24.s[0] //k00 + fmla v0.4s,v12.4s,v26.s[1] //k21, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v8.4s,v12.4s,v24.s[1] //k01 + fmla v0.4s,v29.4s,v26.s[2] //k22 + fmla v4.4s,v29.4s,v25.s[2] //k12 + fmla v8.4s,v29.4s,v24.s[2] //k02 +//add bias + fadd v0.4s,v0.4s,v30.4s +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + st1 {v0.4s},[x4],#16 + + //L-1 + st1 {v4.4s},[x10],#16 + + + //L0 + st1 {v8.4s},[x6],#16 + + ins v27.s[3],v12.s[3] + + //next loop + subs x8,x8,1 + b.ne mid_loop_start + +mid_last_4: + sub x8,x2,x7 + cmp x8,4 + blt mid_less_4 + + ld1 {v0.4s},[x4] + ld1 {v4.4s},[x10] + + ld1 {v12.4s},[x0],#16 + ins v13.s[0],v31.s[0] + + ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 + //v12: a00, a01, a02 ,a03 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v0.4s,v12.4s,v26.s[1] //k21, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v8.4s,v12.4s,v24.s[1] //k01 + fmla v0.4s,v29.4s,v26.s[2] //k22 + fmla v4.4s,v29.4s,v25.s[2] //k12 + fmla v8.4s,v29.4s,v24.s[2] //k02 +//add bias + fadd v0.4s,v0.4s,v30.4s +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + st1 {v0.4s},[x4],#16 + + + //L-1 + st1 {v4.4s},[x10],#16 + + //L0 + st1 {v8.4s},[x6],#16 + + b mid_row_start + +mid_less_4: + cmp x8,1 + blt mid_row_start + +mid_left_1_2_3: + + dup v12.4s,v31.s[0] + dup v13.4s,v31.s[0] + dup v0.4s,v31.s[0] + dup v4.4s,v31.s[0] + + + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + + ldr s28,[x4] + ins v0.s[0],v28.s[0] + ldr s28,[x10] + ins v4.s[0],v28.s[0] + + + cmp x8,2 + blt mid_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + + ldr s28,[x4,#4] + ins v0.s[1],v28.s[0] + ldr s28,[x10, #4] + ins v4.s[1],v28.s[0] + + cmp x8,3 + blt mid_left_load_done + + + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + + ldr s28,[x4,#8] + ins v0.s[2],v28.s[0] + ldr s28,[x10, #8] + ins v4.s[2],v28.s[0] + +mid_left_load_done: + + ext 
v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v0.4s,v12.4s,v26.s[1] //k21, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v8.4s,v12.4s,v24.s[1] //k01 + + fmla v0.4s,v29.4s,v26.s[2] //k22 + fmla v4.4s,v29.4s,v25.s[2] //k12 + fmla v8.4s,v29.4s,v24.s[2] //k02 + +//add bias + fadd v0.4s,v0.4s,v30.4s + //save result:1, 2 or 3 + ins v28.s[0],v0.s[0] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + ins v28.s[0],v8.s[0] + str s28,[x6],#4 + + cmp x8,2 + blt mid_row_start + + ins v28.s[0],v0.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + + ins v28.s[0],v8.s[1] + str s28,[x6],#4 + + cmp x8,3 + blt mid_row_start + + ins v28.s[0],v0.s[2] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v4.s[2] + str s28,[x10] + + ins v28.s[0],v8.s[2] + str s28,[x6] + + b mid_row_start + + +last_row_start: + + + sub x7,x2,1 + lsr x8,x7,2 + lsl x7,x8,2 + + dup v27.4s,v31.s[0] + + add x10,x4,x9 //L-1 + + cbz x8,last_last_4 + +last_loop_start: + + ld1 {v0.4s},[x4] + ld1 {v4.4s},[x10] + + ld1 {v12.4s},[x0],#16 + ld1 {v13.4s},[x0] + + ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 + //v12: a00, a01, a02 ,a03 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v0.4s,v12.4s,v26.s[1] //k21, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v0.4s,v29.4s,v26.s[2] //k22 + fmla v4.4s,v29.4s,v25.s[2] //k12 +//add bias + fadd v0.4s,v0.4s,v30.4s + +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + st1 {v0.4s},[x4],#16 + + //L-1 +//add bias + fadd v4.4s,v4.4s,v30.4s +#ifdef CONV_RELU_FUSE + fmax v4.4s,v4.4s,v31.4s +#endif + st1 {v4.4s},[x10],#16 + + ins 
v27.s[3],v12.s[3] + + //next loop + subs x8,x8,1 + b.ne last_loop_start + +last_last_4: + + sub x8,x2,x7 + cmp x8,4 + blt last_less_4 + + ld1 {v12.4s},[x0],#16 + dup v13.4s,v31.s[0] + + ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 + //v12: a00, a01, a02 ,a03 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + ld1 {v0.4s},[x4] + ld1 {v4.4s},[x10] + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v0.4s,v12.4s,v26.s[1] //k21, + fmla v0.4s,v29.4s,v26.s[2] //k22 +//add bias + fadd v0.4s,v0.4s,v30.4s + +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + st1 {v0.4s},[x4],#16 + + + //L-1 + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v4.4s,v29.4s,v25.s[2] //k12 +//add bias + fadd v4.4s,v4.4s,v30.4s +#ifdef CONV_RELU_FUSE + fmax v4.4s,v4.4s,v31.4s +#endif + st1 {v4.4s},[x10],#16 + + ins v27.s[3],v12.s[3] + + b last_row_done + +last_less_4: + + cmp x8,1 + blt last_row_done + +last_1_2_3: + + dup v12.4s,v31.s[0] + dup v13.4s,v31.s[0] + dup v0.4s,v31.s[0] + dup v4.4s,v31.s[0] + + + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + ldr s28,[x4] + ins v0.s[0],v28.s[0] + ldr s28,[x10] + ins v4.s[0],v28.s[0] + + sub x7,x8,1 + cbz x7, last_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + + ldr s28,[x4,#4] + ins v0.s[1],v28.s[0] + ldr s28,[x10,#4] + ins v4.s[1],v28.s[0] + + + sub x7,x8,2 + cbz x7, last_left_load_done + + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + + ldr s28,[x4,#8] + ins v0.s[2],v28.s[0] + ldr s28,[x10,#8] + ins v4.s[2],v28.s[0] + +last_left_load_done: + + ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v0.4s,v12.4s,v26.s[1] //k21, + fmla v0.4s,v29.4s,v26.s[2] //k22 + + //L-1 + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v4.4s,v29.4s,v25.s[2] //k12 + +//add bias + fadd v0.4s,v0.4s,v30.4s + //save result: 1 2 or 3 + ins v28.s[0],v0.s[0] +#ifdef CONV_RELU_FUSE + 
fmax s28,s28,s31 +#endif + str s28,[x4],#4 + +//add bias + fadd v4.4s,v4.4s,v30.4s + + ins v28.s[0],v4.s[0] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x10],#4 + + cmp x8,2 + blt last_row_done + + ins v28.s[0],v0.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v4.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x10],#4 + + cmp x8,3 + blt last_row_done + + ins v28.s[0],v0.s[2] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4] + + ins v28.s[0],v4.s[2] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x10] + + +last_row_done: + ret + +single_line: + mov x10,x4 + cbz x8,single_line_last_4 + +single_line_row_loop: + //load 4 input + ld1 {v12.4s},[x0],#16 + ld1r {v13.4s},[x0] + + ext v28.16b,v27.16b,v12.16b,12 //last_3, a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + ins v27.s[3],v12.s[3] + + dup v4.4s,v30.s[0] + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v4.4s,v29.4s,v25.s[2] //k12 +#ifdef CONV_RELU_FUSE + fmax v4.4s,v4.4s,v31.4s +#endif + + st1 {v4.4s},[x10],#16 + + //next loop + subs x8,x8,1 + b.ne single_line_row_loop + +single_line_last_4: + //x8=x2-x7, and x7<=x2-1 and x7=4N and N is non-negative number, so left ones: 1-4 + sub x8,x2,x7 + cmp x8,4 + blt single_line_less_4 + + ld1 {v12.4s},[x0],#16 + ins v13.s[0],v31.s[0] + + ext v28.16b,v27.16b,v12.16b,12 //last_3, a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + + dup v4.4s,v30.s[0] + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v4.4s,v29.4s,v25.s[2] //k12 +#ifdef CONV_RELU_FUSE + fmax v4.4s,v4.4s,v31.4s +#endif + + st1 {v4.4s},[x10],#16 + b single_line_done + +single_line_less_4: + cmp x8,1 + bge single_line_1_2_3 + b single_line_done + +single_line_1_2_3: + dup v12.4s,v31.s[0] + dup v13.4s,v31.s[0] + + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + sub x7,x8,1 + cbz x7,single_line_left_load_done + + ldr 
s28,[x0],#4 + ins v12.s[1],v28.s[0] + sub x7,x8,2 + + cbz x7,single_line_left_load_done + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + +single_line_left_load_done: + ext v28.16b,v27.16b,v12.16b,12 //last_3, a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + dup v4.4s,v30.s[0] + fmla v4.4s,v28.4s,v25.s[0] //k10 + fmla v4.4s,v12.4s,v25.s[1] //k11 + fmla v4.4s,v29.4s,v25.s[2] //k12 +#ifdef CONV_RELU_FUSE + fmax v4.4s,v4.4s,v31.4s +#endif + + //save result + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + cmp x8,2 + blt single_line_done + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + + cmp x8,3 + blt single_line_done + + ins v28.s[0],v4.s[2] + str s28,[x10] + +single_line_done: + ret diff --git a/executor/operator/arm64/conv/dw_k3s1p1_relu_fused.S b/executor/operator/arm64/conv/dw_k3s1p1_relu_fused.S index 6466b842c..9c4ac3f7a 100644 --- a/executor/operator/arm64/conv/dw_k3s1p1_relu_fused.S +++ b/executor/operator/arm64/conv/dw_k3s1p1_relu_fused.S @@ -1,27 +1,27 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ -#define KERNEL_NAME dw_k3s1p1_relu_fused -#define CONV_RELU_FUSE - -#include "./dw_k3s1p1.S" +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: haitao@openailab.com + */ +#define KERNEL_NAME dw_k3s1p1_relu_fused +#define CONV_RELU_FUSE + +#include "./dw_k3s1p1.S" diff --git a/executor/operator/arm64/conv/dw_k3s2p1.S b/executor/operator/arm64/conv/dw_k3s2p1.S index 808ead01d..3f796c5db 100644 --- a/executor/operator/arm64/conv/dw_k3s2p1.S +++ b/executor/operator/arm64/conv/dw_k3s2p1.S @@ -1,689 +1,689 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haitao@openailab.com - */ -//x0: input -//x1: h -//x2: w -//x3: kernel -//x4: output //L-2 -//x5: bias -//x10: L-1 output -//x6: L0 output -//x7: processed item -//x8: counter -//x9: output width - -//v0-v3: L-2 -//v4-v7: L-1 -//v8-v11: L0 -//v12-v15/v16-v20: input two group -//v24-v26: kernel -//v27 --- saved previous vector -// v28,v29 --- shifted - -//v20 bias - -#ifndef KERNEL_NAME -#define KERNEL_NAME dw_k3s2p1 -#endif - -.text -.align 5 -.global KERNEL_NAME -.type KERNEL_NAME, %function - - -KERNEL_NAME: - //Load Kernel - ld1 {v24.4s,v25.4s,v26.4s}, [x3] - ext v26.16b,v25.16b,v26.16b,8 - ext v25.16b,v24.16b,v25.16b,12 - - sub x9,x2,1 - lsr x9,x9,1 - add x9,x9,1 - lsl x9,x9,2 - fmov s31,wzr - dup v31.4s,v31.s[0] - - //get bias - cbz x5,non_biases - ldr s21,[x5] - dup v21.4s,v21.s[0] - b first_row_start - -non_biases: - fmov s21,wzr - dup v21.4s,v21.s[0] - -//first row - -first_row_start: - sub x1,x1,1 - - lsr x8,x2,3 //x8 loop counter - lsl x7,x8,3 //x7 processed number - - ins v27.s[3],v31.s[0] //pre_vector for input - - mov x10,x4 //L-1 //L1 ONLY - cbz x8,first_less_8 - -first_loop_start: - //load 4 float input - ld1 {v12.4s,v13.4s},[x0],#32 //a00,a01,a02,a03,a04,a05,a06,a07 - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - /* - v28: last_3, a01, a03, a05 - v29 a00 a02, a04, a06 - v30 a01 a03, a05, a07 - */ - - //L-1: k1 xinput - fmul v4.4s,v28.4s,v25.s[0] //k10, - fmla 
v4.4s,v29.4s,v25.s[1] //k11, - fmla v4.4s,v30.4s,v25.s[2] //k12 - - ins v27.s[3],v13.s[3] //save prev vector - - //save data, four are valid - st1 {v4.4s},[x10],#16 - - //next loop - subs x8,x8,1 - b.ne first_loop_start - -first_less_8: - - sub x8,x2,x7 - cmp x8,1 - blt first_row_done - -first_1_7: - dup v13.4s,v31.s[0] - - cmp x8,4 - blt first_1_2_3 - - ld1 {v12.4s},[x0],#16 - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-1 - fmul v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v29.4s,v25.s[1] //k11, - fmla v4.4s,v30.4s,v25.s[2] //k12 - - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - - sub x8,x8,4 - cbz x8,first_row_done - - ins v27.s[3],v12.s[3] - -first_1_2_3: - dup v12.4s,v31.s[0] - - //1-3 items - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - - cmp x8,2 - blt first_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - - cmp x8,3 - blt first_left_load_done - - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - -first_left_load_done: - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-1 - fmul v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v29.4s,v25.s[1] //k11, - fmla v4.4s,v30.4s,v25.s[2] //k12 - -first_left_save_1_3: - - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - cmp x8,3 - blt first_row_done - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - -first_row_done: - - -odd_row_start: - sub x1,x1,1 - cbz x1, last_row_is_odd - - lsr x8,x2,3 - lsl x7,x8,3 - - dup v27.4s,v31.s[0] - //x4: L-2 - add x6,x4,x9 //L0 - - cbz x8,odd_less_8 - -odd_loop_start: - - ld1 {v0.4s}, [x4] //L-2 - ld1 {v12.4s,v13.4s},[x0],#32 - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmul v8.4s,v28.4s,v24.s[0] 
//k00 - fmla v0.4s,v29.4s,v26.s[1] //k21, - fmla v8.4s,v29.4s,v24.s[1] //k01 - fmla v0.4s,v30.4s,v26.s[2] //k22 - fmla v8.4s,v30.4s,v24.s[2] //k02 -//add bias - fadd v0.4s,v0.4s,v21.4s - -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - - //L0 is always zero - - st1 {v0.4s}, [x4],#16 - st1 {v8.4s}, [x6],#16 - - ins v27.s[3],v13.s[3] - - //next loop - subs x8,x8,1 - b.ne odd_loop_start - -odd_less_8: - sub x8,x2,x7 - cmp x8,1 - blt odd_row_done - -odd_1_7: - dup v13.4s,v31.s[0] - cmp x8,4 - blt odd_1_2_3 - - ld1 {v12.4s},[x0],#16 - - ldr s28,[x4] - ins v0.s[0],v28.s[0] - - ldr s28,[x4,#4] - ins v0.s[1],v28.s[0] - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla v0.4s,v29.4s,v26.s[1] //k21, - fmla v8.4s,v29.4s,v24.s[1] //k01 - fmla v0.4s,v30.4s,v26.s[2] //k22 - fmla v8.4s,v30.4s,v24.s[2] //k02 - -//add bias - fadd v0.4s,v0.4s,v21.4s - //L0 is always zero - ins v28.s[0],v0.s[0] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v8.s[0] - str s28,[x6],#4 - - ins v28.s[0],v0.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v8.s[1] - str s28,[x6],#4 - - sub x8,x8,4 - cbz x8, odd_row_done - - ins v27.s[3],v12.s[3] - -odd_1_2_3: - - dup v12.4s,v31.s[0] - - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - - ldr s28,[x4] - ins v0.s[0],v28.s[0] - - cmp x8,2 - blt odd_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - - cmp x8,3 - blt odd_left_load_done - - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - - ldr s28,[x4,#4] - ins v0.s[1],v28.s[0] - -odd_left_load_done: - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla 
v0.4s,v29.4s,v26.s[1] //k21, - fmla v8.4s,v29.4s,v24.s[1] //k01 - fmla v0.4s,v30.4s,v26.s[2] //k22 - fmla v8.4s,v30.4s,v24.s[2] //k02 - - //L0 -//add bias - fadd v0.4s,v0.4s,v21.4s - //save result:1 or 2 - ins v28.s[0],v0.s[0] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v8.s[0] - str s28,[x6],#4 - - cmp x8,3 - blt odd_row_done - - ins v28.s[0],v0.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v8.s[1] - str s28,[x6],#4 - -odd_row_done: - -even_row_start: - - lsr x8,x2,3 - lsl x7,x8,3 - - ins v27.s[3],v31.s[0] //pre_vector for input - - mov x10,x4 //L-1 //L1 ONLY - cbz x8,even_less_8 - -even_loop_start: - //load 4 float input - ld1 {v12.4s,v13.4s},[x0],#32 - ld1 {v4.4s},[x10] - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-1: k1 xinput - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v29.4s,v25.s[1] //k11, - fmla v4.4s,v30.4s,v25.s[2] //k12 - - ins v27.s[3],v13.s[3] //save prev vector - - st1 {v4.4s},[x10],#16 - - //next loop - subs x8,x8,1 - b.ne even_loop_start - -even_less_8: - - sub x8,x2,x7 - cmp x8,1 - blt even_row_done - -even_1_7: - dup v13.4s,v31.s[0] - - cmp x8,4 - blt even_1_2_3 - - ld1 {v12.4s},[x0],#16 - ldr s28,[x10] - ins v4.s[0],v28.s[0] - ldr s28,[x10,#4] - ins v4.s[1],v28.s[0] - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-1: k1 xinput - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v29.4s,v25.s[1] //k11, - fmla v4.4s,v30.4s,v25.s[2] //k12 - - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - - sub x8,x8,4 - cbz x8, even_row_done - - ins v27.s[3],v12.s[3] //save prev vector - -even_1_2_3: - dup v12.4s,v31.s[0] - - //1, 2 or 3 items - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - - ldr s28,[x10] - ins 
v4.s[0],v28.s[0] - - sub x7,x8,1 - cbz x7, even_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - - sub x7,x8,2 - cbz x7, even_left_load_done - - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - - ldr s28,[x10,#4] - ins v4.s[1],v28.s[0] - -even_left_load_done: - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-1: k1 xinput - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v29.4s,v25.s[1] //k11, - fmla v4.4s,v30.4s,v25.s[2] //k12 - - - //save result: 1 or 2 - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - cmp x8,3 - blt even_row_done - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - -even_row_done: - sub x1,x1,1 - cbz x1, last_even_add_bias - b odd_row_start - -last_even_add_bias: - mov x10,x4 - //cal out_w - sub x6,x2,1 - lsr x6,x6,1 - add x6,x6,1 - //finish - lsr x8,x6,3 - lsl x7,x8,3 - cbz x8,last_even_less_8 -last_even_loop_start: - ld1 {v12.4s,v13.4s},[x10],#32 -//add bias - fadd v12.4s,v12.4s,v21.4s - fadd v13.4s,v13.4s,v21.4s -#ifdef CONV_RELU_FUSE - fmax v12.4s,v12.4s,v31.4s - fmax v13.4s,v13.4s,v31.4s -#endif - st1 {v12.4s},[x4],#16 - st1 {v13.4s},[x4],#16 -// next loop - subs x8,x8,1 - b.ne last_even_loop_start -last_even_less_8: - subs x8,x6,x7 - cmp x8,1 - blt last_even_loop_done - cmp x8,4 - blt last_even_1_2_3 - ld1 {v0.4s},[x10],#16 -//add bias - fadd v0.4s,v0.4s,v21.4s -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - st1 {v0.4s},[x4],#16 - subs x8,x8,4 - cbz x8,last_even_loop_done -last_even_1_2_3: - cmp x8,1 - blt last_even_loop_done - ldr s0,[x10],#0x4 - //add bias - fadd s0,s0,s21 -#ifdef CONV_RELU_FUSE - fmax s0,s0,s31 -#endif - str s0,[x4],#0x4 - subs x8,x8,1 - cbz x8,last_even_loop_done - b last_even_1_2_3 - -last_even_loop_done: - b all_row_done - -// Last Row: even or odd - -last_row_is_odd: - - lsr x8,x2,3 - lsl x7,x8,3 - - dup v27.4s,v31.s[0] - cbz x8,last_odd_less_8 - -last_odd_loop_start: - - ld1 {v0.4s},[x4] //L-2 - ld1 
{v12.4s,v13.4s},[x0],#32 - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v0.4s,v29.4s,v26.s[1] //k21, - fmla v0.4s,v30.4s,v26.s[2] //k22 -//add bias - fadd v0.4s,v0.4s,v21.4s - -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - st1 {v0.4s},[x4],#16 - - ins v27.s[3],v13.s[3] - - //next loop - subs x8,x8,1 - b.ne last_odd_loop_start - -last_odd_less_8: - sub x8,x2,x7 - cmp x8,1 - blt last_odd_row_done - cmp x8,4 - blt last_odd_1_2_3 - - ld1 {v12.4s},[x0],#16 - dup v13.4s,v31.s[0] - - //L-2 - ldr s28,[x4] - ins v0.s[0],v28.s[0] - ldr s28,[x4,#4] - ins v0.s[1],v28.s[0] - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v0.4s,v29.4s,v26.s[1] //k21, - fmla v0.4s,v30.4s,v26.s[2] //k22 - -//add bias - fadd v0.4s,v0.4s,v21.4s - ins v28.s[0],v0.s[0] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v0.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - sub x8,x8,4 - cbz x8,last_odd_row_done - - ins v27.s[3],v12.s[3] - -last_odd_1_2_3: - - dup v12.4s,v31.s[0] - - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - - ldr s28,[x4] - ins v0.s[0],v28.s[0] - - cmp x8,2 - blt last_odd_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - - cmp x8,3 - blt last_odd_left_load_done - - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - - ldr s28,[x4,#4] - ins v0.s[1],v28.s[0] - -last_odd_left_load_done: - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v0.4s,v29.4s,v26.s[1] //k21, - fmla v0.4s,v30.4s,v26.s[2] //k22 - -//add bias - fadd v0.4s,v0.4s,v21.4s - //save result:1 or 2 - ins 
v28.s[0],v0.s[0] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - - cmp x8,3 - blt last_odd_row_done - - ins v28.s[0],v0.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - -last_odd_row_done: -all_row_done: - ret - - - - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ +//x0: input +//x1: h +//x2: w +//x3: kernel +//x4: output //L-2 +//x5: bias +//x10: L-1 output +//x6: L0 output +//x7: processed item +//x8: counter +//x9: output width + +//v0-v3: L-2 +//v4-v7: L-1 +//v8-v11: L0 +//v12-v15/v16-v20: input two group +//v24-v26: kernel +//v27 --- saved previous vector +// v28,v29 --- shifted + +//v20 bias + +#ifndef KERNEL_NAME +#define KERNEL_NAME dw_k3s2p1 +#endif + +.text +.align 5 +.global KERNEL_NAME +.type KERNEL_NAME, %function + + +KERNEL_NAME: + //Load Kernel + ld1 {v24.4s,v25.4s,v26.4s}, [x3] + ext v26.16b,v25.16b,v26.16b,8 + ext v25.16b,v24.16b,v25.16b,12 + + sub x9,x2,1 + lsr x9,x9,1 + add x9,x9,1 + lsl x9,x9,2 + fmov s31,wzr + dup v31.4s,v31.s[0] + + //get bias + cbz x5,non_biases + ldr s21,[x5] + dup v21.4s,v21.s[0] + b first_row_start + +non_biases: + fmov s21,wzr + dup v21.4s,v21.s[0] + +//first row + +first_row_start: + sub x1,x1,1 + + lsr x8,x2,3 //x8 loop counter + lsl x7,x8,3 //x7 processed number + + ins v27.s[3],v31.s[0] //pre_vector for input + + mov x10,x4 //L-1 //L1 ONLY + cbz x8,first_less_8 + +first_loop_start: + //load 4 float input + ld1 {v12.4s,v13.4s},[x0],#32 //a00,a01,a02,a03,a04,a05,a06,a07 + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + /* + v28: last_3, a01, a03, a05 + v29 a00 a02, a04, a06 + v30 a01 a03, a05, a07 + */ + + //L-1: k1 xinput + fmul v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v29.4s,v25.s[1] //k11, + fmla v4.4s,v30.4s,v25.s[2] //k12 + + ins v27.s[3],v13.s[3] //save prev vector + + //save data, four are valid + st1 {v4.4s},[x10],#16 + + //next loop + subs x8,x8,1 + b.ne first_loop_start + +first_less_8: + + sub x8,x2,x7 + cmp x8,1 + blt first_row_done + +first_1_7: + dup v13.4s,v31.s[0] + + cmp x8,4 + blt first_1_2_3 + + ld1 {v12.4s},[x0],#16 + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 
v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-1 + fmul v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v29.4s,v25.s[1] //k11, + fmla v4.4s,v30.4s,v25.s[2] //k12 + + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + + sub x8,x8,4 + cbz x8,first_row_done + + ins v27.s[3],v12.s[3] + +first_1_2_3: + dup v12.4s,v31.s[0] + + //1-3 items + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + + cmp x8,2 + blt first_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + + cmp x8,3 + blt first_left_load_done + + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + +first_left_load_done: + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-1 + fmul v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v29.4s,v25.s[1] //k11, + fmla v4.4s,v30.4s,v25.s[2] //k12 + +first_left_save_1_3: + + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + cmp x8,3 + blt first_row_done + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + +first_row_done: + + +odd_row_start: + sub x1,x1,1 + cbz x1, last_row_is_odd + + lsr x8,x2,3 + lsl x7,x8,3 + + dup v27.4s,v31.s[0] + //x4: L-2 + add x6,x4,x9 //L0 + + cbz x8,odd_less_8 + +odd_loop_start: + + ld1 {v0.4s}, [x4] //L-2 + ld1 {v12.4s,v13.4s},[x0],#32 + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v0.4s,v29.4s,v26.s[1] //k21, + fmla v8.4s,v29.4s,v24.s[1] //k01 + fmla v0.4s,v30.4s,v26.s[2] //k22 + fmla v8.4s,v30.4s,v24.s[2] //k02 +//add bias + fadd v0.4s,v0.4s,v21.4s + +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + + //L0 is always zero + + st1 {v0.4s}, [x4],#16 + st1 {v8.4s}, [x6],#16 + + ins v27.s[3],v13.s[3] + + //next loop + subs x8,x8,1 + b.ne odd_loop_start + +odd_less_8: + sub x8,x2,x7 + cmp 
x8,1 + blt odd_row_done + +odd_1_7: + dup v13.4s,v31.s[0] + cmp x8,4 + blt odd_1_2_3 + + ld1 {v12.4s},[x0],#16 + + ldr s28,[x4] + ins v0.s[0],v28.s[0] + + ldr s28,[x4,#4] + ins v0.s[1],v28.s[0] + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v0.4s,v29.4s,v26.s[1] //k21, + fmla v8.4s,v29.4s,v24.s[1] //k01 + fmla v0.4s,v30.4s,v26.s[2] //k22 + fmla v8.4s,v30.4s,v24.s[2] //k02 + +//add bias + fadd v0.4s,v0.4s,v21.4s + //L0 is always zero + ins v28.s[0],v0.s[0] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v8.s[0] + str s28,[x6],#4 + + ins v28.s[0],v0.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v8.s[1] + str s28,[x6],#4 + + sub x8,x8,4 + cbz x8, odd_row_done + + ins v27.s[3],v12.s[3] + +odd_1_2_3: + + dup v12.4s,v31.s[0] + + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + + ldr s28,[x4] + ins v0.s[0],v28.s[0] + + cmp x8,2 + blt odd_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + + cmp x8,3 + blt odd_left_load_done + + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + + ldr s28,[x4,#4] + ins v0.s[1],v28.s[0] + +odd_left_load_done: + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v0.4s,v29.4s,v26.s[1] //k21, + fmla v8.4s,v29.4s,v24.s[1] //k01 + fmla v0.4s,v30.4s,v26.s[2] //k22 + fmla v8.4s,v30.4s,v24.s[2] //k02 + + //L0 +//add bias + fadd v0.4s,v0.4s,v21.4s + //save result:1 or 2 + ins v28.s[0],v0.s[0] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v8.s[0] + str s28,[x6],#4 + + cmp x8,3 + blt odd_row_done + + ins v28.s[0],v0.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 
+#endif + str s28,[x4],#4 + + ins v28.s[0],v8.s[1] + str s28,[x6],#4 + +odd_row_done: + +even_row_start: + + lsr x8,x2,3 + lsl x7,x8,3 + + ins v27.s[3],v31.s[0] //pre_vector for input + + mov x10,x4 //L-1 //L1 ONLY + cbz x8,even_less_8 + +even_loop_start: + //load 4 float input + ld1 {v12.4s,v13.4s},[x0],#32 + ld1 {v4.4s},[x10] + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-1: k1 xinput + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v29.4s,v25.s[1] //k11, + fmla v4.4s,v30.4s,v25.s[2] //k12 + + ins v27.s[3],v13.s[3] //save prev vector + + st1 {v4.4s},[x10],#16 + + //next loop + subs x8,x8,1 + b.ne even_loop_start + +even_less_8: + + sub x8,x2,x7 + cmp x8,1 + blt even_row_done + +even_1_7: + dup v13.4s,v31.s[0] + + cmp x8,4 + blt even_1_2_3 + + ld1 {v12.4s},[x0],#16 + ldr s28,[x10] + ins v4.s[0],v28.s[0] + ldr s28,[x10,#4] + ins v4.s[1],v28.s[0] + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-1: k1 xinput + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v29.4s,v25.s[1] //k11, + fmla v4.4s,v30.4s,v25.s[2] //k12 + + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + + sub x8,x8,4 + cbz x8, even_row_done + + ins v27.s[3],v12.s[3] //save prev vector + +even_1_2_3: + dup v12.4s,v31.s[0] + + //1, 2 or 3 items + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + + ldr s28,[x10] + ins v4.s[0],v28.s[0] + + sub x7,x8,1 + cbz x7, even_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + + sub x7,x8,2 + cbz x7, even_left_load_done + + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + + ldr s28,[x10,#4] + ins v4.s[1],v28.s[0] + +even_left_load_done: + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-1: k1 xinput + fmla 
v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v29.4s,v25.s[1] //k11, + fmla v4.4s,v30.4s,v25.s[2] //k12 + + + //save result: 1 or 2 + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + cmp x8,3 + blt even_row_done + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + +even_row_done: + sub x1,x1,1 + cbz x1, last_even_add_bias + b odd_row_start + +last_even_add_bias: + mov x10,x4 + //cal out_w + sub x6,x2,1 + lsr x6,x6,1 + add x6,x6,1 + //finish + lsr x8,x6,3 + lsl x7,x8,3 + cbz x8,last_even_less_8 +last_even_loop_start: + ld1 {v12.4s,v13.4s},[x10],#32 +//add bias + fadd v12.4s,v12.4s,v21.4s + fadd v13.4s,v13.4s,v21.4s +#ifdef CONV_RELU_FUSE + fmax v12.4s,v12.4s,v31.4s + fmax v13.4s,v13.4s,v31.4s +#endif + st1 {v12.4s},[x4],#16 + st1 {v13.4s},[x4],#16 +// next loop + subs x8,x8,1 + b.ne last_even_loop_start +last_even_less_8: + subs x8,x6,x7 + cmp x8,1 + blt last_even_loop_done + cmp x8,4 + blt last_even_1_2_3 + ld1 {v0.4s},[x10],#16 +//add bias + fadd v0.4s,v0.4s,v21.4s +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + st1 {v0.4s},[x4],#16 + subs x8,x8,4 + cbz x8,last_even_loop_done +last_even_1_2_3: + cmp x8,1 + blt last_even_loop_done + ldr s0,[x10],#0x4 + //add bias + fadd s0,s0,s21 +#ifdef CONV_RELU_FUSE + fmax s0,s0,s31 +#endif + str s0,[x4],#0x4 + subs x8,x8,1 + cbz x8,last_even_loop_done + b last_even_1_2_3 + +last_even_loop_done: + b all_row_done + +// Last Row: even or odd + +last_row_is_odd: + + lsr x8,x2,3 + lsl x7,x8,3 + + dup v27.4s,v31.s[0] + cbz x8,last_odd_less_8 + +last_odd_loop_start: + + ld1 {v0.4s},[x4] //L-2 + ld1 {v12.4s,v13.4s},[x0],#32 + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v0.4s,v29.4s,v26.s[1] //k21, + fmla v0.4s,v30.4s,v26.s[2] //k22 +//add bias + fadd v0.4s,v0.4s,v21.4s + +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + st1 {v0.4s},[x4],#16 + + ins v27.s[3],v13.s[3] + + //next loop + 
subs x8,x8,1 + b.ne last_odd_loop_start + +last_odd_less_8: + sub x8,x2,x7 + cmp x8,1 + blt last_odd_row_done + cmp x8,4 + blt last_odd_1_2_3 + + ld1 {v12.4s},[x0],#16 + dup v13.4s,v31.s[0] + + //L-2 + ldr s28,[x4] + ins v0.s[0],v28.s[0] + ldr s28,[x4,#4] + ins v0.s[1],v28.s[0] + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v0.4s,v29.4s,v26.s[1] //k21, + fmla v0.4s,v30.4s,v26.s[2] //k22 + +//add bias + fadd v0.4s,v0.4s,v21.4s + ins v28.s[0],v0.s[0] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v0.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + sub x8,x8,4 + cbz x8,last_odd_row_done + + ins v27.s[3],v12.s[3] + +last_odd_1_2_3: + + dup v12.4s,v31.s[0] + + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + + ldr s28,[x4] + ins v0.s[0],v28.s[0] + + cmp x8,2 + blt last_odd_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + + cmp x8,3 + blt last_odd_left_load_done + + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + + ldr s28,[x4,#4] + ins v0.s[1],v28.s[0] + +last_odd_left_load_done: + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v0.4s,v29.4s,v26.s[1] //k21, + fmla v0.4s,v30.4s,v26.s[2] //k22 + +//add bias + fadd v0.4s,v0.4s,v21.4s + //save result:1 or 2 + ins v28.s[0],v0.s[0] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + + cmp x8,3 + blt last_odd_row_done + + ins v28.s[0],v0.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + +last_odd_row_done: +all_row_done: + ret + + + + diff --git a/executor/operator/arm64/conv/dw_k3s2p1_relu_fused.S b/executor/operator/arm64/conv/dw_k3s2p1_relu_fused.S index f1953b510..2a517114d 100644 --- 
a/executor/operator/arm64/conv/dw_k3s2p1_relu_fused.S +++ b/executor/operator/arm64/conv/dw_k3s2p1_relu_fused.S @@ -1,27 +1,27 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ -#define KERNEL_NAME dw_k3s2p1_relu_fused -#define CONV_RELU_FUSE - -#include "./dw_k3s2p1.S" +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: haitao@openailab.com + */ +#define KERNEL_NAME dw_k3s2p1_relu_fused +#define CONV_RELU_FUSE + +#include "./dw_k3s2p1.S" diff --git a/executor/operator/arm64/conv/sgemm_4x16_interleave.S b/executor/operator/arm64/conv/sgemm_4x16_interleave.S index 565470e89..6de3ef5c5 100644 --- a/executor/operator/arm64/conv/sgemm_4x16_interleave.S +++ b/executor/operator/arm64/conv/sgemm_4x16_interleave.S @@ -1,313 +1,313 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: xiaowei@openailab.com - */ -// -// 4*16 single precise floating point matric multiplication -// -// -- -- -- -- -- -- -- -- -// | i0 - - - - - - | | k0 k1 .. kf | | t00 t01 .. t0f | | i0k0 i0k1 .. i0kf | -// | | | . . . . | | | | | -// | i1 - - - - - - | | . . . . | | t10 t11 . t1f | | i1k0 i1k1 .. i1kf | -// | | x | . . . . | + | | = | | -// | i2 - - - - - - | | . . . . | | t20 t21 . t2f | | i2k0 i2k1 .. i2kf | -// | | | . . . . | | | | | -// | i3 - - - - - - | | . . . . | | t30 t31 . t3f | | i3k0 i3k1 .. 
i3kf | -// -- -- -- -- -- -- -- -- -// input 4 x p kernel p x 16 biases 4 x 16 output 4 x 16 p = kernel size -// -// -// optimised for Cortex-A72 pipeline 66 cycle per loop (4*16*4 dot product) -// -// input: -// x0 arg0 have biases flag -// x1 arg1 biases start address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3],i[0-3]k[4]..} -// x2 arg2 input start address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...} -// x3 arg3 kernel start address {k[0-15][0],k[0-15][1],k[0-15][2],k[0-15][3],...} -// x4 arg4 output save address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3],i[0-3]k[4]..} -// x5 arg5 kernel size -// -// output: no -// -// register definition -// x0 have biases flag -// x1 biases start address -// x2 input start address -// x3 kernel start address -// x4 output start address -// x5 loop time = kernal size -// x6 ~ x31 not used -// -// v0~v1 4S data of input0 {i3 i2 i1 i0} -// v2-v3 not used -// v4 4S kernal data {k3 | k2 | k1 | k0} -// v5 4S kernal data {k7 | k6 | k5 | k4} -// v6 4S kernal data {kb | ka | k9 | k8} -// v7 4S kernal data {kf | ke | kd | kc} -// v8~v15 not used -// v16 dot product for {i3k0, i2k0, i1k0, i0k0} -// v17 dot product for {i3k1, i2k1, i1k1, i0k1} -// v18 dot product for {i3k2, i2k2, i1k2, i0k2} -// v19 dot product for {i3k3, i2k3, i1k3, i0k3} -// v20 dot product for {i3k4, i2k4, i1k4, i0k4} -// v21 dot product for {i3k5, i2k5, i1k5, i0k5} -// v22 dot product for {i3k6, i2k6, i1k6, i0k6} -// v23 dot product for {i3k7, i2k7, i1k7, i0k7} -// v24 dot product for {i3k8, i2k8, i1k8, i0k8} -// v25 dot product for {i3k9, i2k9, i1k9, i0k9} -// v26 dot product for {i3ka, i2ka, i1ka, i0ka} -// v27 dot product for {i3kb, i2kb, i1kb, i0kb} -// v28 dot product for {i3kc, i2kc, i1kc, i0kc} -// v29 dot product for {i3kd, i2kd, i1kd, i0kd} -// v30 dot product for {i3ke, i2ke, i1ke, i0ke} -// v31 dot product for {i3kf, i2kf, i1kf, i0kf} - -#ifndef INTERLEAVE_FUNC_NAME -#define INTERLEAVE_FUNC_NAME sgemm_4x16_interleave -#endif - - .section 
.text,"ax" - .align 5 - - .type INTERLEAVE_FUNC_NAME STT_FUNC - .global INTERLEAVE_FUNC_NAME - -INTERLEAVE_FUNC_NAME: -// biases_initial - cbz x0, none_biases - ldp q16, q17 ,[x1] - ldp q18, q19 ,[x1, #0x20] - ldp q20, q21 ,[x1, #0x40] - ldp q22, q23 ,[x1, #0x60] - ldp q24, q25 ,[x1, #0x80] - ldp q26, q27 ,[x1, #0xa0] - ldp q28, q29 ,[x1, #0xc0] - ldp q30, q31 ,[x1, #0xe0] - b convolution_start - -none_biases: - movi d16, #0 - movi d17, #0 - movi d18, #0 - movi d19, #0 - movi d20, #0 - movi d21, #0 - movi d22, #0 - movi d23, #0 - movi d24, #0 - movi d25, #0 - movi d26, #0 - movi d27, #0 - movi d28, #0 - movi d29, #0 - movi d30, #0 - movi d31, #0 - -convolution_start: - // compare to 0x4 - cmp x5, 0x4 - blt loop4_end - lsr x6, x5, 0x2 - -// main loop each loop generate dot prodcut for 4x16SFP -loop4: - ldr q0, [x2] // q0=i[3-0] - ldp q4, q5, [x3] // q4=k[3-0] q5=k[7-4] - fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] - fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] - fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] - fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] - ldp q6, q7, [x3, 0x20] // q6=k[b-8] q7=k[f-c] - fmla v20.4s, v0.4s, v5.s[0] // i[3-0]k[4] - fmla v21.4s, v0.4s, v5.s[1] // i[3-0]k[5] - fmla v22.4s, v0.4s, v5.s[2] // i[3-0]k[6] - fmla v23.4s, v0.4s, v5.s[3] // i[3-0]k[7] - ldr q1, [x2, 0x10] // q1=i[3-0] - fmla v24.4s, v0.4s, v6.s[0] // i[3-0]k[8] - fmla v25.4s, v0.4s, v6.s[1] // i[3-0]k[9] - fmla v26.4s, v0.4s, v6.s[2] // i[3-0]k[a] - ldp q4, q5, [x3, 0x40] // q4=k[3-0] q5=k[7-4] - fmla v27.4s, v0.4s, v6.s[3] // i[3-0]k[b] - fmla v28.4s, v0.4s, v7.s[0] // i[3-0]k[c] - fmla v29.4s, v0.4s, v7.s[1] // i[3-0]k[d] - fmla v30.4s, v0.4s, v7.s[2] // i[3-0]k[e] - fmla v31.4s, v0.4s, v7.s[3] // i[3-0]k[f] - - ldp q6, q7, [x3, 0x60] // q6=k[b-8] q7=k[f-c] - fmla v16.4s, v1.4s, v4.s[0] // i[3-0]k[0] - fmla v17.4s, v1.4s, v4.s[1] // i[3-0]k[1] - fmla v18.4s, v1.4s, v4.s[2] // i[3-0]k[2] - fmla v19.4s, v1.4s, v4.s[3] // i[3-0]k[3] - ldr q0, [x2, 0x20] // q1=i[3-0] - fmla v20.4s, v1.4s, 
v5.s[0] // i[3-0]k[4] - fmla v21.4s, v1.4s, v5.s[1] // i[3-0]k[5] - fmla v22.4s, v1.4s, v5.s[2] // i[3-0]k[6] - fmla v23.4s, v1.4s, v5.s[3] // i[3-0]k[7] - ldp q4, q5, [x3, 0x80] // q4=k[3-0] q5=k[7-4] - fmla v24.4s, v1.4s, v6.s[0] // i[3-0]k[8] - fmla v25.4s, v1.4s, v6.s[1] // i[3-0]k[9] - fmla v26.4s, v1.4s, v6.s[2] // i[3-0]k[a] - fmla v27.4s, v1.4s, v6.s[3] // i[3-0]k[b] - subs x6, x6, #0x1 - prfm pldl1keep, [x2, 0x80] - fmla v28.4s, v1.4s, v7.s[0] // i[3-0]k[c] - fmla v29.4s, v1.4s, v7.s[1] // i[3-0]k[d] - fmla v30.4s, v1.4s, v7.s[2] // i[3-0]k[e] - fmla v31.4s, v1.4s, v7.s[3] // i[3-0]k[f] - - ldp q6, q7, [x3, 0xa0] // q6=k[b-8] q7=k[f-c] - fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] - fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] - fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] - fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] - ldr q1, [x2, 0x30] // q1=i[3-0] - add x2, x2, #0x40 - fmla v20.4s, v0.4s, v5.s[0] // i[3-0]k[4] - fmla v21.4s, v0.4s, v5.s[1] // i[3-0]k[5] - fmla v22.4s, v0.4s, v5.s[2] // i[3-0]k[6] - fmla v23.4s, v0.4s, v5.s[3] // i[3-0]k[7] - ldp q4, q5, [x3, 0xc0] // q4=k[3-0] q5=k[7-4] - fmla v24.4s, v0.4s, v6.s[0] // i[3-0]k[8] - fmla v25.4s, v0.4s, v6.s[1] // i[3-0]k[9] - fmla v26.4s, v0.4s, v6.s[2] // i[3-0]k[a] - fmla v27.4s, v0.4s, v6.s[3] // i[3-0]k[b] - prfm pldl1keep, [x3, 0x140] - fmla v28.4s, v0.4s, v7.s[0] // i[3-0]k[c] - fmla v29.4s, v0.4s, v7.s[1] // i[3-0]k[d] - fmla v30.4s, v0.4s, v7.s[2] // i[3-0]k[e] - fmla v31.4s, v0.4s, v7.s[3] // i[3-0]k[f] - - ldp q6, q7, [x3, 0xe0] // q6=k[b-8] q7=k[f-c] - fmla v16.4s, v1.4s, v4.s[0] // i[3-0]k[0] - fmla v17.4s, v1.4s, v4.s[1] // i[3-0]k[1] - fmla v18.4s, v1.4s, v4.s[2] // i[3-0]k[2] - fmla v19.4s, v1.4s, v4.s[3] // i[3-0]k[3] - prfm pldl1keep, [x3, 0x180] - fmla v20.4s, v1.4s, v5.s[0] // i[3-0]k[4] - fmla v21.4s, v1.4s, v5.s[1] // i[3-0]k[5] - fmla v22.4s, v1.4s, v5.s[2] // i[3-0]k[6] - fmla v23.4s, v1.4s, v5.s[3] // i[3-0]k[7] - prfm pldl1keep, [x3, 0x1c0] - fmla v24.4s, v1.4s, v6.s[0] // i[3-0]k[8] - 
fmla v25.4s, v1.4s, v6.s[1] // i[3-0]k[9] - fmla v26.4s, v1.4s, v6.s[2] // i[3-0]k[a] - fmla v27.4s, v1.4s, v6.s[3] // i[3-0]k[b] - prfm pldl1keep, [x3, 0x200] - add x3, x3, #0x100 - fmla v28.4s, v1.4s, v7.s[0] // i[3-0]k[c] - fmla v29.4s, v1.4s, v7.s[1] // i[3-0]k[d] - fmla v30.4s, v1.4s, v7.s[2] // i[3-0]k[e] - fmla v31.4s, v1.4s, v7.s[3] // i[3-0]k[f] - b.ne loop4 - - and x5, x5, 0x3 - -loop4_end: - cbz x5, finish - -loop1: - ldr q0, [x2], 0x10 // q0=i[3-0] - ldp q4, q5, [x3] // q4=k[3-0] q5=k[7-4] - ldp q6, q7, [x3, 0x20] // q6=k[b-8] q7=k[f-c] - subs x5 ,x5 ,0x1 - fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] - fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] - fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] - fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] - fmla v20.4s, v0.4s, v5.s[0] // i[3-0]k[4] - fmla v21.4s, v0.4s, v5.s[1] // i[3-0]k[5] - fmla v22.4s, v0.4s, v5.s[2] // i[3-0]k[6] - fmla v23.4s, v0.4s, v5.s[3] // i[3-0]k[7] - fmla v24.4s, v0.4s, v6.s[0] // i[3-0]k[8] - fmla v25.4s, v0.4s, v6.s[1] // i[3-0]k[9] - fmla v26.4s, v0.4s, v6.s[2] // i[3-0]k[a] - fmla v27.4s, v0.4s, v6.s[3] // i[3-0]k[b] - fmla v28.4s, v0.4s, v7.s[0] // i[3-0]k[c] - fmla v29.4s, v0.4s, v7.s[1] // i[3-0]k[d] - fmla v30.4s, v0.4s, v7.s[2] // i[3-0]k[e] - fmla v31.4s, v0.4s, v7.s[3] // i[3-0]k[f] - add x3, x3, #0x40 - - b.ne loop1 - - -finish: -// store result -#ifdef CONV_RELU_FUSE - fmov s0,wzr - dup v1.4s,v0.s[0] - fmax v16.4s,v16.4s,v1.4s - fmax v17.4s,v17.4s,v1.4s -#endif - stp q16, q17 ,[x4] - -#ifdef CONV_RELU_FUSE - fmax v18.4s,v18.4s,v1.4s - fmax v19.4s,v19.4s,v1.4s -#endif - stp q18, q19 ,[x4, #0x20] - -#ifdef CONV_RELU_FUSE - fmax v20.4s,v20.4s,v1.4s - fmax v21.4s,v21.4s,v1.4s -#endif - stp q20, q21 ,[x4, #0x40] - -#ifdef CONV_RELU_FUSE - fmax v22.4s,v22.4s,v1.4s - fmax v23.4s,v23.4s,v1.4s -#endif - stp q22, q23 ,[x4, #0x60] - -#ifdef CONV_RELU_FUSE - fmax v24.4s,v24.4s,v1.4s - fmax v25.4s,v25.4s,v1.4s -#endif - stp q24, q25 ,[x4, #0x80] - -#ifdef CONV_RELU_FUSE - fmax v26.4s,v26.4s,v1.4s - 
fmax v27.4s,v27.4s,v1.4s -#endif - stp q26, q27 ,[x4, #0xa0] - -#ifdef CONV_RELU_FUSE - fmax v28.4s,v28.4s,v1.4s - fmax v29.4s,v29.4s,v1.4s -#endif - stp q28, q29 ,[x4, #0xc0] - - -#ifdef CONV_RELU_FUSE - fmax v30.4s,v30.4s,v1.4s - fmax v31.4s,v31.4s,v1.4s -#endif - stp q30, q31 ,[x4, #0xe0] - - ret - -// zero data to fill out a few more cache lines so the prefetcher doesn't -// cause uninitialized memory to be read - - .space 256 - .end - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: xiaowei@openailab.com + */ +// +// 4*16 single precise floating point matric multiplication +// +// -- -- -- -- -- -- -- -- +// | i0 - - - - - - | | k0 k1 .. kf | | t00 t01 .. t0f | | i0k0 i0k1 .. i0kf | +// | | | . . . . | | | | | +// | i1 - - - - - - | | . . . . | | t10 t11 . t1f | | i1k0 i1k1 .. i1kf | +// | | x | . . . . | + | | = | | +// | i2 - - - - - - | | . . . . | | t20 t21 . t2f | | i2k0 i2k1 .. i2kf | +// | | | . . . . | | | | | +// | i3 - - - - - - | | . . . . | | t30 t31 . t3f | | i3k0 i3k1 .. 
i3kf | +// -- -- -- -- -- -- -- -- +// input 4 x p kernel p x 16 biases 4 x 16 output 4 x 16 p = kernel size +// +// +// optimised for Cortex-A72 pipeline 66 cycle per loop (4*16*4 dot product) +// +// input: +// x0 arg0 have biases flag +// x1 arg1 biases start address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3],i[0-3]k[4]..} +// x2 arg2 input start address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...} +// x3 arg3 kernel start address {k[0-15][0],k[0-15][1],k[0-15][2],k[0-15][3],...} +// x4 arg4 output save address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3],i[0-3]k[4]..} +// x5 arg5 kernel size +// +// output: no +// +// register definition +// x0 have biases flag +// x1 biases start address +// x2 input start address +// x3 kernel start address +// x4 output start address +// x5 loop time = kernal size +// x6 ~ x31 not used +// +// v0~v1 4S data of input0 {i3 i2 i1 i0} +// v2-v3 not used +// v4 4S kernal data {k3 | k2 | k1 | k0} +// v5 4S kernal data {k7 | k6 | k5 | k4} +// v6 4S kernal data {kb | ka | k9 | k8} +// v7 4S kernal data {kf | ke | kd | kc} +// v8~v15 not used +// v16 dot product for {i3k0, i2k0, i1k0, i0k0} +// v17 dot product for {i3k1, i2k1, i1k1, i0k1} +// v18 dot product for {i3k2, i2k2, i1k2, i0k2} +// v19 dot product for {i3k3, i2k3, i1k3, i0k3} +// v20 dot product for {i3k4, i2k4, i1k4, i0k4} +// v21 dot product for {i3k5, i2k5, i1k5, i0k5} +// v22 dot product for {i3k6, i2k6, i1k6, i0k6} +// v23 dot product for {i3k7, i2k7, i1k7, i0k7} +// v24 dot product for {i3k8, i2k8, i1k8, i0k8} +// v25 dot product for {i3k9, i2k9, i1k9, i0k9} +// v26 dot product for {i3ka, i2ka, i1ka, i0ka} +// v27 dot product for {i3kb, i2kb, i1kb, i0kb} +// v28 dot product for {i3kc, i2kc, i1kc, i0kc} +// v29 dot product for {i3kd, i2kd, i1kd, i0kd} +// v30 dot product for {i3ke, i2ke, i1ke, i0ke} +// v31 dot product for {i3kf, i2kf, i1kf, i0kf} + +#ifndef INTERLEAVE_FUNC_NAME +#define INTERLEAVE_FUNC_NAME sgemm_4x16_interleave +#endif + + .section 
.text,"ax" + .align 5 + + .type INTERLEAVE_FUNC_NAME STT_FUNC + .global INTERLEAVE_FUNC_NAME + +INTERLEAVE_FUNC_NAME: +// biases_initial + cbz x0, none_biases + ldp q16, q17 ,[x1] + ldp q18, q19 ,[x1, #0x20] + ldp q20, q21 ,[x1, #0x40] + ldp q22, q23 ,[x1, #0x60] + ldp q24, q25 ,[x1, #0x80] + ldp q26, q27 ,[x1, #0xa0] + ldp q28, q29 ,[x1, #0xc0] + ldp q30, q31 ,[x1, #0xe0] + b convolution_start + +none_biases: + movi d16, #0 + movi d17, #0 + movi d18, #0 + movi d19, #0 + movi d20, #0 + movi d21, #0 + movi d22, #0 + movi d23, #0 + movi d24, #0 + movi d25, #0 + movi d26, #0 + movi d27, #0 + movi d28, #0 + movi d29, #0 + movi d30, #0 + movi d31, #0 + +convolution_start: + // compare to 0x4 + cmp x5, 0x4 + blt loop4_end + lsr x6, x5, 0x2 + +// main loop each loop generate dot prodcut for 4x16SFP +loop4: + ldr q0, [x2] // q0=i[3-0] + ldp q4, q5, [x3] // q4=k[3-0] q5=k[7-4] + fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] + fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] + fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] + fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] + ldp q6, q7, [x3, 0x20] // q6=k[b-8] q7=k[f-c] + fmla v20.4s, v0.4s, v5.s[0] // i[3-0]k[4] + fmla v21.4s, v0.4s, v5.s[1] // i[3-0]k[5] + fmla v22.4s, v0.4s, v5.s[2] // i[3-0]k[6] + fmla v23.4s, v0.4s, v5.s[3] // i[3-0]k[7] + ldr q1, [x2, 0x10] // q1=i[3-0] + fmla v24.4s, v0.4s, v6.s[0] // i[3-0]k[8] + fmla v25.4s, v0.4s, v6.s[1] // i[3-0]k[9] + fmla v26.4s, v0.4s, v6.s[2] // i[3-0]k[a] + ldp q4, q5, [x3, 0x40] // q4=k[3-0] q5=k[7-4] + fmla v27.4s, v0.4s, v6.s[3] // i[3-0]k[b] + fmla v28.4s, v0.4s, v7.s[0] // i[3-0]k[c] + fmla v29.4s, v0.4s, v7.s[1] // i[3-0]k[d] + fmla v30.4s, v0.4s, v7.s[2] // i[3-0]k[e] + fmla v31.4s, v0.4s, v7.s[3] // i[3-0]k[f] + + ldp q6, q7, [x3, 0x60] // q6=k[b-8] q7=k[f-c] + fmla v16.4s, v1.4s, v4.s[0] // i[3-0]k[0] + fmla v17.4s, v1.4s, v4.s[1] // i[3-0]k[1] + fmla v18.4s, v1.4s, v4.s[2] // i[3-0]k[2] + fmla v19.4s, v1.4s, v4.s[3] // i[3-0]k[3] + ldr q0, [x2, 0x20] // q1=i[3-0] + fmla v20.4s, v1.4s, 
v5.s[0] // i[3-0]k[4] + fmla v21.4s, v1.4s, v5.s[1] // i[3-0]k[5] + fmla v22.4s, v1.4s, v5.s[2] // i[3-0]k[6] + fmla v23.4s, v1.4s, v5.s[3] // i[3-0]k[7] + ldp q4, q5, [x3, 0x80] // q4=k[3-0] q5=k[7-4] + fmla v24.4s, v1.4s, v6.s[0] // i[3-0]k[8] + fmla v25.4s, v1.4s, v6.s[1] // i[3-0]k[9] + fmla v26.4s, v1.4s, v6.s[2] // i[3-0]k[a] + fmla v27.4s, v1.4s, v6.s[3] // i[3-0]k[b] + subs x6, x6, #0x1 + prfm pldl1keep, [x2, 0x80] + fmla v28.4s, v1.4s, v7.s[0] // i[3-0]k[c] + fmla v29.4s, v1.4s, v7.s[1] // i[3-0]k[d] + fmla v30.4s, v1.4s, v7.s[2] // i[3-0]k[e] + fmla v31.4s, v1.4s, v7.s[3] // i[3-0]k[f] + + ldp q6, q7, [x3, 0xa0] // q6=k[b-8] q7=k[f-c] + fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] + fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] + fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] + fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] + ldr q1, [x2, 0x30] // q1=i[3-0] + add x2, x2, #0x40 + fmla v20.4s, v0.4s, v5.s[0] // i[3-0]k[4] + fmla v21.4s, v0.4s, v5.s[1] // i[3-0]k[5] + fmla v22.4s, v0.4s, v5.s[2] // i[3-0]k[6] + fmla v23.4s, v0.4s, v5.s[3] // i[3-0]k[7] + ldp q4, q5, [x3, 0xc0] // q4=k[3-0] q5=k[7-4] + fmla v24.4s, v0.4s, v6.s[0] // i[3-0]k[8] + fmla v25.4s, v0.4s, v6.s[1] // i[3-0]k[9] + fmla v26.4s, v0.4s, v6.s[2] // i[3-0]k[a] + fmla v27.4s, v0.4s, v6.s[3] // i[3-0]k[b] + prfm pldl1keep, [x3, 0x140] + fmla v28.4s, v0.4s, v7.s[0] // i[3-0]k[c] + fmla v29.4s, v0.4s, v7.s[1] // i[3-0]k[d] + fmla v30.4s, v0.4s, v7.s[2] // i[3-0]k[e] + fmla v31.4s, v0.4s, v7.s[3] // i[3-0]k[f] + + ldp q6, q7, [x3, 0xe0] // q6=k[b-8] q7=k[f-c] + fmla v16.4s, v1.4s, v4.s[0] // i[3-0]k[0] + fmla v17.4s, v1.4s, v4.s[1] // i[3-0]k[1] + fmla v18.4s, v1.4s, v4.s[2] // i[3-0]k[2] + fmla v19.4s, v1.4s, v4.s[3] // i[3-0]k[3] + prfm pldl1keep, [x3, 0x180] + fmla v20.4s, v1.4s, v5.s[0] // i[3-0]k[4] + fmla v21.4s, v1.4s, v5.s[1] // i[3-0]k[5] + fmla v22.4s, v1.4s, v5.s[2] // i[3-0]k[6] + fmla v23.4s, v1.4s, v5.s[3] // i[3-0]k[7] + prfm pldl1keep, [x3, 0x1c0] + fmla v24.4s, v1.4s, v6.s[0] // i[3-0]k[8] + 
fmla v25.4s, v1.4s, v6.s[1] // i[3-0]k[9] + fmla v26.4s, v1.4s, v6.s[2] // i[3-0]k[a] + fmla v27.4s, v1.4s, v6.s[3] // i[3-0]k[b] + prfm pldl1keep, [x3, 0x200] + add x3, x3, #0x100 + fmla v28.4s, v1.4s, v7.s[0] // i[3-0]k[c] + fmla v29.4s, v1.4s, v7.s[1] // i[3-0]k[d] + fmla v30.4s, v1.4s, v7.s[2] // i[3-0]k[e] + fmla v31.4s, v1.4s, v7.s[3] // i[3-0]k[f] + b.ne loop4 + + and x5, x5, 0x3 + +loop4_end: + cbz x5, finish + +loop1: + ldr q0, [x2], 0x10 // q0=i[3-0] + ldp q4, q5, [x3] // q4=k[3-0] q5=k[7-4] + ldp q6, q7, [x3, 0x20] // q6=k[b-8] q7=k[f-c] + subs x5 ,x5 ,0x1 + fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] + fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] + fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] + fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] + fmla v20.4s, v0.4s, v5.s[0] // i[3-0]k[4] + fmla v21.4s, v0.4s, v5.s[1] // i[3-0]k[5] + fmla v22.4s, v0.4s, v5.s[2] // i[3-0]k[6] + fmla v23.4s, v0.4s, v5.s[3] // i[3-0]k[7] + fmla v24.4s, v0.4s, v6.s[0] // i[3-0]k[8] + fmla v25.4s, v0.4s, v6.s[1] // i[3-0]k[9] + fmla v26.4s, v0.4s, v6.s[2] // i[3-0]k[a] + fmla v27.4s, v0.4s, v6.s[3] // i[3-0]k[b] + fmla v28.4s, v0.4s, v7.s[0] // i[3-0]k[c] + fmla v29.4s, v0.4s, v7.s[1] // i[3-0]k[d] + fmla v30.4s, v0.4s, v7.s[2] // i[3-0]k[e] + fmla v31.4s, v0.4s, v7.s[3] // i[3-0]k[f] + add x3, x3, #0x40 + + b.ne loop1 + + +finish: +// store result +#ifdef CONV_RELU_FUSE + fmov s0,wzr + dup v1.4s,v0.s[0] + fmax v16.4s,v16.4s,v1.4s + fmax v17.4s,v17.4s,v1.4s +#endif + stp q16, q17 ,[x4] + +#ifdef CONV_RELU_FUSE + fmax v18.4s,v18.4s,v1.4s + fmax v19.4s,v19.4s,v1.4s +#endif + stp q18, q19 ,[x4, #0x20] + +#ifdef CONV_RELU_FUSE + fmax v20.4s,v20.4s,v1.4s + fmax v21.4s,v21.4s,v1.4s +#endif + stp q20, q21 ,[x4, #0x40] + +#ifdef CONV_RELU_FUSE + fmax v22.4s,v22.4s,v1.4s + fmax v23.4s,v23.4s,v1.4s +#endif + stp q22, q23 ,[x4, #0x60] + +#ifdef CONV_RELU_FUSE + fmax v24.4s,v24.4s,v1.4s + fmax v25.4s,v25.4s,v1.4s +#endif + stp q24, q25 ,[x4, #0x80] + +#ifdef CONV_RELU_FUSE + fmax v26.4s,v26.4s,v1.4s + 
fmax v27.4s,v27.4s,v1.4s +#endif + stp q26, q27 ,[x4, #0xa0] + +#ifdef CONV_RELU_FUSE + fmax v28.4s,v28.4s,v1.4s + fmax v29.4s,v29.4s,v1.4s +#endif + stp q28, q29 ,[x4, #0xc0] + + +#ifdef CONV_RELU_FUSE + fmax v30.4s,v30.4s,v1.4s + fmax v31.4s,v31.4s,v1.4s +#endif + stp q30, q31 ,[x4, #0xe0] + + ret + +// zero data to fill out a few more cache lines so the prefetcher doesn't +// cause uninitialized memory to be read + + .space 256 + .end + diff --git a/executor/operator/arm64/conv/sgemm_4x16_interleave_relu_fused.S b/executor/operator/arm64/conv/sgemm_4x16_interleave_relu_fused.S index 515cc0d14..c3dac1948 100644 --- a/executor/operator/arm64/conv/sgemm_4x16_interleave_relu_fused.S +++ b/executor/operator/arm64/conv/sgemm_4x16_interleave_relu_fused.S @@ -1,28 +1,28 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ - -#define CONV_RELU_FUSE 1 -#define INTERLEAVE_FUNC_NAME sgemm_4x16_interleave_relu_fused - -#include "./sgemm_4x16_interleave.S" +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: haitao@openailab.com + */ + +#define CONV_RELU_FUSE 1 +#define INTERLEAVE_FUNC_NAME sgemm_4x16_interleave_relu_fused + +#include "./sgemm_4x16_interleave.S" diff --git a/executor/operator/arm64/conv/sgemm_4x4_interleave.S b/executor/operator/arm64/conv/sgemm_4x4_interleave.S index 59a820ba8..2197c2604 100644 --- a/executor/operator/arm64/conv/sgemm_4x4_interleave.S +++ b/executor/operator/arm64/conv/sgemm_4x4_interleave.S @@ -1,170 +1,170 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: xiaowei@openailab.com - */ -// -// 4*4 single precise floating point matric multiplication -// -// -- -- -- -- -- -- -- -- -// | i0 - - - - - - | | k0 k1 k2 k3 | | t00 t01 t02 t03 | | i0k0 i0k1 .. i0kf | -// | | | . . . . | | | | | -// | i1 - - - - - - | | . . . . | | t10 t11 t12 t13 | | i1k0 i1k1 .. i1kf | -// | | x | . . . . | + | | = | | -// | i2 - - - - - - | | . . . . | | t20 t21 t22 t23 | | i2k0 i2k1 .. i2kf | -// | | | . . . . | | | | | -// | i3 - - - - - - | | . . . . | | t30 t31 t32 t33 | | i3k0 i3k1 .. i3kf | -// -- -- -- -- -- -- -- -- -// input 4 x p kernel p x 4 biases 4 x 4 output 4 x 4 p = kernel size -// -// -// optimised for Cortex-A72 pipeline 18 cycle per loop (4*4*4 dot product) -// -// input: -// x0 arg0 have biases flag -// x1 arg1 biases start address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3]} -// x2 arg2 input start address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...} -// x3 arg3 kernel start address {k[0-3][0],k[0-3][1],k[0-3][2],k[0-3][3],...} -// x4 arg4 output save address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3]} -// x5 arg5 kernel size -// -// output: no -// -// register definition -// x0 have biases flag -// x1 biases start address -// x2 input start address -// x3 kernel start address -// x4 output start address -// x5 loop time = kernal size -// x6 ~ x31 not used -// -// v0-3 4S data of input0 {i3 i2 i1 i0} -// v4-7 4S kernal data {k3 k2 k1 k0} -// v8~v15 not used -// v16 dot product for {i3k0, i2k0, i1k0, i0k0} -// v17 dot product for {i3k1, i2k1, i1k1, i0k1} -// v18 dot product for {i3k2, i2k2, i1k2, i0k2} -// v19 dot product for {i3k3, i2k3, i1k3, i0k3} -// v20~V31 not used - .section .text,"ax" - .align 5 - -#ifndef INTERLEAVE_FUNC_NAME -#define INTERLEAVE_FUNC_NAME sgemm_4x4_interleave -#endif - .type INTERLEAVE_FUNC_NAME STT_FUNC - .global 
INTERLEAVE_FUNC_NAME - -INTERLEAVE_FUNC_NAME: -// initial - cbz x0, non_biases - - ldp q16, q17, [x1] - ldp q18, q19, [x1,0x20] - b convoluation_start - -non_biases: - movi d16, #0x0 - movi d17, #0x0 - movi d18, #0x0 - movi d19, #0x0 - -convoluation_start: - // compare to 0x4 - cmp x5, 0x4 - blt loop4_end - lsr x6, x5, 0x2 - -// main loop each loop generate dot prodcut for 4x4SFP -loop4: - subs x6 ,x6 ,0x1 - - ldr q0, [x2] // q0=i[3-0] - ldp q4, q5, [x3] // q4=k[3-0] - fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] - fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] - ldr q1, [x2, 0x10] // q1=i[3-0] - fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] - fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] - - ldp q2, q3, [x2, 0x20] // q2=i[3-0] q3=i[3-0] - fmla v16.4s, v1.4s, v5.s[0] // i[3-0]k[0] - fmla v17.4s, v1.4s, v5.s[1] // i[3-0]k[1] - ldp q6, q7, [x3, 0x20] // q6=k[3-0] q7=q7=k[3-0] - fmla v18.4s, v1.4s, v5.s[2] // i[3-0]k[2] - fmla v19.4s, v1.4s, v5.s[3] // i[3-0]k[3] - - fmla v16.4s, v2.4s, v6.s[0] // i[3-0]k[0] - fmla v17.4s, v2.4s, v6.s[1] // i[3-0]k[1] - prfm pldl1keep, [x2, 0x140] - add x2, x2, #0x40 - fmla v18.4s, v2.4s, v6.s[2] // i[3-0]k[2] - fmla v19.4s, v2.4s, v6.s[3] // i[3-0]k[3] - - prfm pldl1keep, [x3, 0x140] - add x3, x3, #0x40 - fmla v16.4s, v3.4s, v7.s[0] // i[3-0]k[0] - fmla v17.4s, v3.4s, v7.s[1] // i[3-0]k[1] - fmla v18.4s, v3.4s, v7.s[2] // i[3-0]k[2] - fmla v19.4s, v3.4s, v7.s[3] // i[3-0]k[3] - - b.ne loop4 - - and x5, x5, 0x3 - -loop4_end: - cbz x5, finish - -loop1: - subs x5 ,x5 ,0x1 - ldr q0, [x2], 0x10 // q0=i[3-0] - ldr q4, [x3], 0x10 // q4=k[3-0] - fmla v16.4s, v0.4s, v4.s[0] // i[0]k[3-0] - fmla v17.4s, v0.4s, v4.s[1] // i[1]k[3-0] - fmla v18.4s, v0.4s, v4.s[2] // i[2]k[3-0] - fmla v19.4s, v0.4s, v4.s[3] // i[3]k[3-0] - - b.ne loop1 -finish: -// store result -#ifdef CONV_RELU_FUSE - fmov s0,wzr - dup v1.4s,v0.s[0] - fmax v16.4s,v16.4s,v1.4s - fmax v17.4s,v17.4s,v1.4s -#endif - stp q16, q17, [x4] - -#ifdef CONV_RELU_FUSE - fmax v18.4s,v18.4s,v1.4s - fmax 
v19.4s,v19.4s,v1.4s -#endif - stp q18, q19, [x4,0x20] - - ret - -// zero data to fill out a few more cache lines so the prefetcher doesn't -// cause uninitialized memory to be read - - .space 256 - .end - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: xiaowei@openailab.com + */ +// +// 4*4 single precise floating point matric multiplication +// +// -- -- -- -- -- -- -- -- +// | i0 - - - - - - | | k0 k1 k2 k3 | | t00 t01 t02 t03 | | i0k0 i0k1 .. i0kf | +// | | | . . . . | | | | | +// | i1 - - - - - - | | . . . . | | t10 t11 t12 t13 | | i1k0 i1k1 .. i1kf | +// | | x | . . . . | + | | = | | +// | i2 - - - - - - | | . . . . | | t20 t21 t22 t23 | | i2k0 i2k1 .. i2kf | +// | | | . . . . | | | | | +// | i3 - - - - - - | | . . . . | | t30 t31 t32 t33 | | i3k0 i3k1 .. 
i3kf | +// -- -- -- -- -- -- -- -- +// input 4 x p kernel p x 4 biases 4 x 4 output 4 x 4 p = kernel size +// +// +// optimised for Cortex-A72 pipeline 18 cycle per loop (4*4*4 dot product) +// +// input: +// x0 arg0 have biases flag +// x1 arg1 biases start address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3]} +// x2 arg2 input start address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...} +// x3 arg3 kernel start address {k[0-3][0],k[0-3][1],k[0-3][2],k[0-3][3],...} +// x4 arg4 output save address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3]} +// x5 arg5 kernel size +// +// output: no +// +// register definition +// x0 have biases flag +// x1 biases start address +// x2 input start address +// x3 kernel start address +// x4 output start address +// x5 loop time = kernal size +// x6 ~ x31 not used +// +// v0-3 4S data of input0 {i3 i2 i1 i0} +// v4-7 4S kernal data {k3 k2 k1 k0} +// v8~v15 not used +// v16 dot product for {i3k0, i2k0, i1k0, i0k0} +// v17 dot product for {i3k1, i2k1, i1k1, i0k1} +// v18 dot product for {i3k2, i2k2, i1k2, i0k2} +// v19 dot product for {i3k3, i2k3, i1k3, i0k3} +// v20~V31 not used + .section .text,"ax" + .align 5 + +#ifndef INTERLEAVE_FUNC_NAME +#define INTERLEAVE_FUNC_NAME sgemm_4x4_interleave +#endif + .type INTERLEAVE_FUNC_NAME STT_FUNC + .global INTERLEAVE_FUNC_NAME + +INTERLEAVE_FUNC_NAME: +// initial + cbz x0, non_biases + + ldp q16, q17, [x1] + ldp q18, q19, [x1,0x20] + b convoluation_start + +non_biases: + movi d16, #0x0 + movi d17, #0x0 + movi d18, #0x0 + movi d19, #0x0 + +convoluation_start: + // compare to 0x4 + cmp x5, 0x4 + blt loop4_end + lsr x6, x5, 0x2 + +// main loop each loop generate dot prodcut for 4x4SFP +loop4: + subs x6 ,x6 ,0x1 + + ldr q0, [x2] // q0=i[3-0] + ldp q4, q5, [x3] // q4=k[3-0] + fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] + fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] + ldr q1, [x2, 0x10] // q1=i[3-0] + fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] + fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] + + ldp 
q2, q3, [x2, 0x20] // q2=i[3-0] q3=i[3-0] + fmla v16.4s, v1.4s, v5.s[0] // i[3-0]k[0] + fmla v17.4s, v1.4s, v5.s[1] // i[3-0]k[1] + ldp q6, q7, [x3, 0x20] // q6=k[3-0] q7=q7=k[3-0] + fmla v18.4s, v1.4s, v5.s[2] // i[3-0]k[2] + fmla v19.4s, v1.4s, v5.s[3] // i[3-0]k[3] + + fmla v16.4s, v2.4s, v6.s[0] // i[3-0]k[0] + fmla v17.4s, v2.4s, v6.s[1] // i[3-0]k[1] + prfm pldl1keep, [x2, 0x140] + add x2, x2, #0x40 + fmla v18.4s, v2.4s, v6.s[2] // i[3-0]k[2] + fmla v19.4s, v2.4s, v6.s[3] // i[3-0]k[3] + + prfm pldl1keep, [x3, 0x140] + add x3, x3, #0x40 + fmla v16.4s, v3.4s, v7.s[0] // i[3-0]k[0] + fmla v17.4s, v3.4s, v7.s[1] // i[3-0]k[1] + fmla v18.4s, v3.4s, v7.s[2] // i[3-0]k[2] + fmla v19.4s, v3.4s, v7.s[3] // i[3-0]k[3] + + b.ne loop4 + + and x5, x5, 0x3 + +loop4_end: + cbz x5, finish + +loop1: + subs x5 ,x5 ,0x1 + ldr q0, [x2], 0x10 // q0=i[3-0] + ldr q4, [x3], 0x10 // q4=k[3-0] + fmla v16.4s, v0.4s, v4.s[0] // i[0]k[3-0] + fmla v17.4s, v0.4s, v4.s[1] // i[1]k[3-0] + fmla v18.4s, v0.4s, v4.s[2] // i[2]k[3-0] + fmla v19.4s, v0.4s, v4.s[3] // i[3]k[3-0] + + b.ne loop1 +finish: +// store result +#ifdef CONV_RELU_FUSE + fmov s0,wzr + dup v1.4s,v0.s[0] + fmax v16.4s,v16.4s,v1.4s + fmax v17.4s,v17.4s,v1.4s +#endif + stp q16, q17, [x4] + +#ifdef CONV_RELU_FUSE + fmax v18.4s,v18.4s,v1.4s + fmax v19.4s,v19.4s,v1.4s +#endif + stp q18, q19, [x4,0x20] + + ret + +// zero data to fill out a few more cache lines so the prefetcher doesn't +// cause uninitialized memory to be read + + .space 256 + .end + diff --git a/executor/operator/arm64/conv/sgemm_4x4_interleave_relu_fused.S b/executor/operator/arm64/conv/sgemm_4x4_interleave_relu_fused.S index f956b72a5..65162223c 100644 --- a/executor/operator/arm64/conv/sgemm_4x4_interleave_relu_fused.S +++ b/executor/operator/arm64/conv/sgemm_4x4_interleave_relu_fused.S @@ -1,27 +1,27 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ -#define CONV_RELU_FUSE 1 -#define INTERLEAVE_FUNC_NAME sgemm_4x4_interleave_relu_fused - -#include "./sgemm_4x4_interleave.S" +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: haitao@openailab.com + */ +#define CONV_RELU_FUSE 1 +#define INTERLEAVE_FUNC_NAME sgemm_4x4_interleave_relu_fused + +#include "./sgemm_4x4_interleave.S" diff --git a/executor/operator/arm64/fc/Makefile b/executor/operator/arm64/fc/Makefile index 243a02695..d4482a4cd 100644 --- a/executor/operator/arm64/fc/Makefile +++ b/executor/operator/arm64/fc/Makefile @@ -1,7 +1,7 @@ -obj-y+=fully_connected_fast.o -obj-y+=sgemv_1x8_a72.o -obj-y+=sgemv_1x2_a72.o -obj-y+=sgemv_1x8_a53.o -obj-y+=sgemv_1x2_a53.o - -fully_connected_fast_CXXFLAGS+=-I../include +obj-y+=fully_connected_fast.o +obj-y+=sgemv_1x8_a72.o +obj-y+=sgemv_1x2_a72.o +obj-y+=sgemv_1x8_a53.o +obj-y+=sgemv_1x2_a53.o + +fully_connected_fast_CXXFLAGS+=-I../include diff --git a/executor/operator/arm64/fc/fully_connected_fast.cpp b/executor/operator/arm64/fc/fully_connected_fast.cpp index 6c1b71849..234a26539 100644 --- a/executor/operator/arm64/fc/fully_connected_fast.cpp +++ b/executor/operator/arm64/fc/fully_connected_fast.cpp @@ -176,8 +176,8 @@ struct FCOps : public MTNodeOps Tensor* tensor; tensor = node->GetInputTensor(1); - int M = tensor->GetShape().GetH(); - int K = tensor->GetShape().GetW(); + int M = tensor->GetShape().Shape(0); + int K = tensor->GetShape().Shape(1); float* weight = ( float* )get_tensor_mem(tensor); @@ -188,8 +188,6 @@ struct FCOps : public MTNodeOps if(exec_attr->low_mem_mode) { - printf("Free fc weight: %s %d\n", tensor->GetName().c_str(), tensor->GetTotalSize()); - tensor->FreeMem(); } @@ -240,8 +238,8 @@ struct FCOps : public MTNodeOps /* weight */ tensor = node->GetInputTensor(1); - int M = tensor->GetShape().GetH(); - int K = tensor->GetShape().GetW(); + int M = tensor->GetShape().Shape(0); + int K = tensor->GetShape().Shape(1); float* weight_interleaved = any_cast(node->GetAttr("weight_interleaved")); /* output */ @@ -338,6 +336,10 @@ struct FCOps : public MTNodeOps NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) { + 
const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + FCOps* ops = new FCOps(); int master_cpu = cpu_info->GetMasterCPU(); diff --git a/executor/operator/arm64/fc/sgemv_1x2_a53.S b/executor/operator/arm64/fc/sgemv_1x2_a53.S index a131d9f35..4cdec48bf 100644 --- a/executor/operator/arm64/fc/sgemv_1x2_a53.S +++ b/executor/operator/arm64/fc/sgemv_1x2_a53.S @@ -1,126 +1,126 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: xiaowei@openailab.com - */ - -// -// 1*2 single precise floating point matric multiplication -// -// -- -- -// | k0 k1 | -// | . . | -// -- -- | . . | -- -- -- -- -// | i0 - - - - - - | x | . . | + | b0 b1 | = | i0k0 i0k1 | -// -- -- | . . | -- -- -- -- -// | . . | -// | . . 
| -// -- -- -// input 1 x p kernel p x 2 biases x 2 output 1 x 2 p = kernel size -// -// -// optimised for Cortex-A53 pipeline 15 cycle per loop (1*2*4 dot product) -// the bottleneck is memory bandwidth -// -// input: -// x0 arg0 biases start address {b0, b1 } -// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} -// x2 arg2 kernel data start address {k00, k10, k01, k11, k02, k12, ...} -// x3 arg3 kernel size -// x4 arg4 output data save address {ik0, ik1} -// -// output: no -// -// v0 4S data of input {i3 i2 i1 i0 } -// v1~v7 not used -// v16 2S kernal data0 {k10 | k00} -// v17 not used -// v18 2S kernal data1 {k11 | k01} -// v19 not used -// v20 2S kernal data2 {k12 | k02} -// v21 not used -// v22 2S kernal data3 {k13 | k03} -// v23 not used -// v24-29 not used -// v30 dot product for {ik1, ik0} -// v31 dot product for {ik1, ik0} - - .section .text,"ax" - .align 5 - - .type sgemv_1x2_a53 STT_FUNC - .global sgemv_1x2_a53 -sgemv_1x2_a53: - // initial - movi d30, 0 - prfm pldl1keep, [x1, 0x40] - prfm pldl1keep, [x2, 0x80] - cmp x3, 0x4 - cbz x0, start_convolution - ldr d30, [x0] - -start_convolution: - and x10,x3, 0x3 - b.lt loop4_end - movi d31, 0 - lsr x9, x3, 0x2 - - -// main loop each loop generate dot prodcut for 1x8x2SFP -loop4: - ldr q0, [x1] // q0 = i[3-0] - ldp d16, d18, [x2] // d16 = k[1-0][0] d18 = k[1-0][1] - ldp d20, d22, [x2, 0x10] // d20 = k[1-0][2] d22 = k[1-0][3] - prfm pldl1keep, [x1, 0xa0] - add x1, x1, 0x10 - - fmla v30.2s, v16.2s, v0.s[0] // ik[1-0][0] - subs x9, x9, 0x1 - fmla v31.2s, v18.2s, v0.s[1] // ik[1-0][1] - prfm pldl1keep, [x2, 0x140] - fmla v30.2s, v20.2s, v0.s[2] // ik[1-0][2] - add x2, x2, 0x20 - fmla v31.2s, v22.2s, v0.s[3] // ik[1-0][3] - - b.ne loop4 - fadd v30.2s, v30.2s, v31.2s - -loop4_end: - cbz x10, save_result - -loop1: - ldr s0, [x1], 0x4 - ldr d16,[x2], 0x8 - subs x10,x10, 0x1 - - fmla v30.2s, v16.2s, v0.s[0] - - b.ne loop1 - -save_result: - str d30, [x4] - - ret - - - .space 256 - .end 
- +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: xiaowei@openailab.com + */ + +// +// 1*2 single precise floating point matric multiplication +// +// -- -- +// | k0 k1 | +// | . . | +// -- -- | . . | -- -- -- -- +// | i0 - - - - - - | x | . . | + | b0 b1 | = | i0k0 i0k1 | +// -- -- | . . | -- -- -- -- +// | . . | +// | . . 
| +// -- -- +// input 1 x p kernel p x 2 biases x 2 output 1 x 2 p = kernel size +// +// +// optimised for Cortex-A53 pipeline 15 cycle per loop (1*2*4 dot product) +// the bottleneck is memory bandwidth +// +// input: +// x0 arg0 biases start address {b0, b1 } +// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} +// x2 arg2 kernel data start address {k00, k10, k01, k11, k02, k12, ...} +// x3 arg3 kernel size +// x4 arg4 output data save address {ik0, ik1} +// +// output: no +// +// v0 4S data of input {i3 i2 i1 i0 } +// v1~v7 not used +// v16 2S kernal data0 {k10 | k00} +// v17 not used +// v18 2S kernal data1 {k11 | k01} +// v19 not used +// v20 2S kernal data2 {k12 | k02} +// v21 not used +// v22 2S kernal data3 {k13 | k03} +// v23 not used +// v24-29 not used +// v30 dot product for {ik1, ik0} +// v31 dot product for {ik1, ik0} + + .section .text,"ax" + .align 5 + + .type sgemv_1x2_a53 STT_FUNC + .global sgemv_1x2_a53 +sgemv_1x2_a53: + // initial + movi d30, 0 + prfm pldl1keep, [x1, 0x40] + prfm pldl1keep, [x2, 0x80] + cmp x3, 0x4 + cbz x0, start_convolution + ldr d30, [x0] + +start_convolution: + and x10,x3, 0x3 + b.lt loop4_end + movi d31, 0 + lsr x9, x3, 0x2 + + +// main loop each loop generate dot prodcut for 1x8x2SFP +loop4: + ldr q0, [x1] // q0 = i[3-0] + ldp d16, d18, [x2] // d16 = k[1-0][0] d18 = k[1-0][1] + ldp d20, d22, [x2, 0x10] // d20 = k[1-0][2] d22 = k[1-0][3] + prfm pldl1keep, [x1, 0xa0] + add x1, x1, 0x10 + + fmla v30.2s, v16.2s, v0.s[0] // ik[1-0][0] + subs x9, x9, 0x1 + fmla v31.2s, v18.2s, v0.s[1] // ik[1-0][1] + prfm pldl1keep, [x2, 0x140] + fmla v30.2s, v20.2s, v0.s[2] // ik[1-0][2] + add x2, x2, 0x20 + fmla v31.2s, v22.2s, v0.s[3] // ik[1-0][3] + + b.ne loop4 + fadd v30.2s, v30.2s, v31.2s + +loop4_end: + cbz x10, save_result + +loop1: + ldr s0, [x1], 0x4 + ldr d16,[x2], 0x8 + subs x10,x10, 0x1 + + fmla v30.2s, v16.2s, v0.s[0] + + b.ne loop1 + +save_result: + str d30, [x4] + + ret + + + .space 256 + .end 
+ diff --git a/executor/operator/arm64/fc/sgemv_1x2_a72.S b/executor/operator/arm64/fc/sgemv_1x2_a72.S index 85a7312f3..ea1ad0faa 100644 --- a/executor/operator/arm64/fc/sgemv_1x2_a72.S +++ b/executor/operator/arm64/fc/sgemv_1x2_a72.S @@ -1,126 +1,126 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: xiaowei@openailab.com - */ - -// -// 1*2 single precise floating point matric multiplication -// -// -- -- -// | k0 k1 | -// | . . | -// -- -- | . . | -- -- -- -- -// | i0 - - - - - - | x | . . | + | b0 b1 | = | i0k0 i0k1 | -// -- -- | . . | -- -- -- -- -// | . . | -// | . . 
| -// -- -- -// input 1 x p kernel p x 2 biases x 2 output 1 x 2 p = kernel size -// -// -// optimised for Cortex-A72 pipeline 9 cycle per loop (1*2*4 dot product) -// the bottleneck is memory bandwidth -// -// input: -// x0 arg0 biases start address {b0, b1 } -// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} -// x2 arg2 kernel data start address {k00, k10, k01, k11, k02, k12, ...} -// x3 arg3 kernel size -// x4 arg4 output data save address {ik0, ik1} -// -// output: no -// -// v0 4S data of input {i3 i2 i1 i0 } -// v1~v7 not used -// v16 2S kernal data0 {k10 | k00} -// v17 not used -// v18 2S kernal data1 {k11 | k01} -// v19 not used -// v20 2S kernal data2 {k12 | k02} -// v21 not used -// v22 2S kernal data3 {k13 | k03} -// v23 not used -// v24-29 not used -// v30 dot product for {ik1, ik0} -// v31 dot product for {ik1, ik0} - - .section .text,"ax" - .align 5 - - .type sgemv_1x2_a72 STT_FUNC - .global sgemv_1x2_a72 -sgemv_1x2_a72: -// initial - movi d30, 0 - prfm pldl1keep, [x1, 0x80] - cmp x3, 0x4 - prfm pldl1keep, [x2, 0x100] - prfm pldl1keep, [x2, 0x140] - cbz x0, start_convolution - ldr d30, [x0] - -start_convolution: - and x10,x3, 0x3 - b.lt loop4_end - movi d31, 0 - lsr x9, x3, 0x2 - - -// main loop each loop generate dot prodcut for 1x2x4SFP -loop4: - ldr q0, [x1] // q0 = i[3-0] - ldp d16, d18, [x2] // d16 = k[1-0][0] d18 = k[1-0][1] - ldp d20, d22, [x2, 0x10] // d20 = k[1-0][2] d22 = k[1-0][3] - prfm pldl1keep, [x1, 0x100] - add x1, x1, 0x10 - - fmla v30.2s, v16.2s, v0.s[0] // ik[1-0][0] - subs x9, x9, 0x1 - fmla v31.2s, v18.2s, v0.s[1] // ik[1-0][1] - prfm pldl1keep, [x2, 0x200] - add x2, x2, 0x20 - fmla v30.2s, v20.2s, v0.s[2] // ik[1-0][2] - fmla v31.2s, v22.2s, v0.s[3] // ik[1-0][3] - - b.ne loop4 - fadd v30.2s, v30.2s, v31.2s - -loop4_end: - cbz x10, save_result - -loop1: - ldr s0, [x1], 0x4 - ldr d16,[x2], 0x8 - subs x10,x10, 0x1 - - fmla v30.2s, v16.2s, v0.s[0] - - b.ne loop1 - -save_result: - str d30, [x4] - 
- ret - - .space 256 - .end - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: xiaowei@openailab.com + */ + +// +// 1*2 single precise floating point matric multiplication +// +// -- -- +// | k0 k1 | +// | . . | +// -- -- | . . | -- -- -- -- +// | i0 - - - - - - | x | . . | + | b0 b1 | = | i0k0 i0k1 | +// -- -- | . . | -- -- -- -- +// | . . | +// | . . 
| +// -- -- +// input 1 x p kernel p x 2 biases x 2 output 1 x 2 p = kernel size +// +// +// optimised for Cortex-A72 pipeline 9 cycle per loop (1*2*4 dot product) +// the bottleneck is memory bandwidth +// +// input: +// x0 arg0 biases start address {b0, b1 } +// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} +// x2 arg2 kernel data start address {k00, k10, k01, k11, k02, k12, ...} +// x3 arg3 kernel size +// x4 arg4 output data save address {ik0, ik1} +// +// output: no +// +// v0 4S data of input {i3 i2 i1 i0 } +// v1~v7 not used +// v16 2S kernal data0 {k10 | k00} +// v17 not used +// v18 2S kernal data1 {k11 | k01} +// v19 not used +// v20 2S kernal data2 {k12 | k02} +// v21 not used +// v22 2S kernal data3 {k13 | k03} +// v23 not used +// v24-29 not used +// v30 dot product for {ik1, ik0} +// v31 dot product for {ik1, ik0} + + .section .text,"ax" + .align 5 + + .type sgemv_1x2_a72 STT_FUNC + .global sgemv_1x2_a72 +sgemv_1x2_a72: +// initial + movi d30, 0 + prfm pldl1keep, [x1, 0x80] + cmp x3, 0x4 + prfm pldl1keep, [x2, 0x100] + prfm pldl1keep, [x2, 0x140] + cbz x0, start_convolution + ldr d30, [x0] + +start_convolution: + and x10,x3, 0x3 + b.lt loop4_end + movi d31, 0 + lsr x9, x3, 0x2 + + +// main loop each loop generate dot prodcut for 1x2x4SFP +loop4: + ldr q0, [x1] // q0 = i[3-0] + ldp d16, d18, [x2] // d16 = k[1-0][0] d18 = k[1-0][1] + ldp d20, d22, [x2, 0x10] // d20 = k[1-0][2] d22 = k[1-0][3] + prfm pldl1keep, [x1, 0x100] + add x1, x1, 0x10 + + fmla v30.2s, v16.2s, v0.s[0] // ik[1-0][0] + subs x9, x9, 0x1 + fmla v31.2s, v18.2s, v0.s[1] // ik[1-0][1] + prfm pldl1keep, [x2, 0x200] + add x2, x2, 0x20 + fmla v30.2s, v20.2s, v0.s[2] // ik[1-0][2] + fmla v31.2s, v22.2s, v0.s[3] // ik[1-0][3] + + b.ne loop4 + fadd v30.2s, v30.2s, v31.2s + +loop4_end: + cbz x10, save_result + +loop1: + ldr s0, [x1], 0x4 + ldr d16,[x2], 0x8 + subs x10,x10, 0x1 + + fmla v30.2s, v16.2s, v0.s[0] + + b.ne loop1 + +save_result: + str d30, [x4] + 
+ ret + + .space 256 + .end + diff --git a/executor/operator/arm64/fc/sgemv_1x8_a53.S b/executor/operator/arm64/fc/sgemv_1x8_a53.S index b0382a813..e7b5698e2 100644 --- a/executor/operator/arm64/fc/sgemv_1x8_a53.S +++ b/executor/operator/arm64/fc/sgemv_1x8_a53.S @@ -1,133 +1,133 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: xiaowei@openailab.com - */ - -// -// 1*8 single precise floating point matric multiplication -// -// -- -- -// | k0 k1 .. k7 | -// | . . . . | -// -- -- | . . . . | -- -- -- -- -// | i0 - - - - - - | x | . . . . | + | b0 b1 .. b7 | = | i0k0 i0k1 .. i0k7 | -// -- -- | . . . . | -- -- -- -- -// | . . . . | -// | . . . . 
| -// -- -- -// input 1 x p kernel p x 8 biases 1 x 8 output 1 x 8 p = kernel size -// -// -// optimised for Cortex-A53 pipeline 43 cycle per loop (1*8*4 dot product) -// the bottleneck is memory bandwidth -// -// input: -// x0 arg0 biases start address {b0, b1, b2, b3, b4, b5, b6, b7}} -// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} -// x2 arg2 kernel data start address {k00, k10, k20, k30, k40, k50, k60, k70, k80, k01, k11, k21, k31, ...} -// x3 arg3 kernel size -// x4 arg4 output data save address {ik0, ik1, ik2, ik3, ik4, ik5, ik6, ik7, ik8} -// -// output: no -// -// v0 4S data of input {i3 i2 i1 i0 } -// v1~v7 not used -// v16 4S kernal data0 {k30 | k20 | k10 | k00} -// v17 4S kernal data4 {k70 | k60 | k50 | k40} -// v18 4S kernal data1 {k31 | k21 | k11 | k01} -// v19 4S kernal data5 {k71 | k61 | k51 | k41} -// v20 4S kernal data2 {k32 | k22 | k12 | k02} -// v21 4S kernal data6 {k72 | k62 | k52 | k42} -// v22 4S kernal data3 {k33 | k23 | k13 | k03} -// v23 4S kernal data7 {k73 | k63 | k53 | k43} -// v24-v29 not used -// v30 dot product for {ik3, ik2, ik1, ik0} -// v31 dot product for {ik7, ik6, ik5, ik4} - - .section .text,"ax" - .align 5 - - .type sgemv_1x8_a53 STT_FUNC - .global sgemv_1x8_a53 -sgemv_1x8_a53: - // initial - movi d30, 0 - cmp x3, 0x4 - movi d31, 0 - prfm pldl1keep, [x1, 0x40] - cbz x0, start_convolution - ldp q30, q31, [x0] - -start_convolution: - b.lt loop1 - lsr x6, x3, 0x2 - movi d28, 0 - movi d29, 0 - -// main loop each loop generate dot prodcut for 1x8x4SFP -loop4: - ldr q0, [x1] // q0 = i[3-0] - ldp q16, q17, [x2] // q16 = k[3-0][0] q17 = k[7-4][0] - ldp q18, q19, [x2, 0x20] // q18 = k[3-0][1] q19 = k[7-4][1] - ldp q20, q21, [x2, 0x40] // q20 = k[3-0][2] q21 = k[7-4][2] - ldp q22, q23, [x2, 0x60] // q22 = k[3-0][3] q23 = k[7-4][3] - subs x6, x6, 0x1 - - fmla v28.4s, v16.4s, v0.s[0] // ik[3-0][0] - fmla v29.4s, v17.4s, v0.s[0] // ik[7-4][0] - prfm pldl1keep, [x1, 0x80] - fmla v30.4s, v18.4s, 
v0.s[1] // ik[3-0][1] - prfm pldl1keep, [x2, 0x340] - fmla v31.4s, v19.4s, v0.s[1] // ik[7-4][1] - add x1, x1, 0x10 - fmla v28.4s, v20.4s, v0.s[2] // ik[3-0][2] - prfm pldl1keep, [x2, 0x380] - fmla v29.4s, v21.4s, v0.s[2] // ik[7-4][2] - fmla v30.4s, v22.4s, v0.s[3] // ik[3-0][3] - add x2, x2, 0x80 - fmla v31.4s, v23.4s, v0.s[3] // ik[7-4][3] - - b.ne loop4 - - and x3, x3, 0x3 - fadd v30.4s, v30.4s, v28.4s - fadd v31.4s, v31.4s, v29.4s - cbz x3, save_result - -loop1: - ldr s0, [x1], 0x4 - ldp q16, q17, [x2], 0x20 - subs x3, x3, 0x1 - - fmla v30.4s, v16.4s, v0.s[0] - fmla v31.4s, v17.4s, v0.s[0] - - b.ne loop1 - -save_result: - stp q30, q31, [x4] - - ret - - .space 256 - .end - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: xiaowei@openailab.com + */ + +// +// 1*8 single precise floating point matric multiplication +// +// -- -- +// | k0 k1 .. k7 | +// | . . . . | +// -- -- | . . . . | -- -- -- -- +// | i0 - - - - - - | x | . . . . | + | b0 b1 .. b7 | = | i0k0 i0k1 .. i0k7 | +// -- -- | . . . . | -- -- -- -- +// | . . . . | +// | . . . . 
| +// -- -- +// input 1 x p kernel p x 8 biases 1 x 8 output 1 x 8 p = kernel size +// +// +// optimised for Cortex-A53 pipeline 43 cycle per loop (1*8*4 dot product) +// the bottleneck is memory bandwidth +// +// input: +// x0 arg0 biases start address {b0, b1, b2, b3, b4, b5, b6, b7}} +// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} +// x2 arg2 kernel data start address {k00, k10, k20, k30, k40, k50, k60, k70, k80, k01, k11, k21, k31, ...} +// x3 arg3 kernel size +// x4 arg4 output data save address {ik0, ik1, ik2, ik3, ik4, ik5, ik6, ik7, ik8} +// +// output: no +// +// v0 4S data of input {i3 i2 i1 i0 } +// v1~v7 not used +// v16 4S kernal data0 {k30 | k20 | k10 | k00} +// v17 4S kernal data4 {k70 | k60 | k50 | k40} +// v18 4S kernal data1 {k31 | k21 | k11 | k01} +// v19 4S kernal data5 {k71 | k61 | k51 | k41} +// v20 4S kernal data2 {k32 | k22 | k12 | k02} +// v21 4S kernal data6 {k72 | k62 | k52 | k42} +// v22 4S kernal data3 {k33 | k23 | k13 | k03} +// v23 4S kernal data7 {k73 | k63 | k53 | k43} +// v24-v29 not used +// v30 dot product for {ik3, ik2, ik1, ik0} +// v31 dot product for {ik7, ik6, ik5, ik4} + + .section .text,"ax" + .align 5 + + .type sgemv_1x8_a53 STT_FUNC + .global sgemv_1x8_a53 +sgemv_1x8_a53: + // initial + movi d30, 0 + cmp x3, 0x4 + movi d31, 0 + prfm pldl1keep, [x1, 0x40] + cbz x0, start_convolution + ldp q30, q31, [x0] + +start_convolution: + b.lt loop1 + lsr x6, x3, 0x2 + movi d28, 0 + movi d29, 0 + +// main loop each loop generate dot prodcut for 1x8x4SFP +loop4: + ldr q0, [x1] // q0 = i[3-0] + ldp q16, q17, [x2] // q16 = k[3-0][0] q17 = k[7-4][0] + ldp q18, q19, [x2, 0x20] // q18 = k[3-0][1] q19 = k[7-4][1] + ldp q20, q21, [x2, 0x40] // q20 = k[3-0][2] q21 = k[7-4][2] + ldp q22, q23, [x2, 0x60] // q22 = k[3-0][3] q23 = k[7-4][3] + subs x6, x6, 0x1 + + fmla v28.4s, v16.4s, v0.s[0] // ik[3-0][0] + fmla v29.4s, v17.4s, v0.s[0] // ik[7-4][0] + prfm pldl1keep, [x1, 0x80] + fmla v30.4s, v18.4s, 
v0.s[1] // ik[3-0][1] + prfm pldl1keep, [x2, 0x340] + fmla v31.4s, v19.4s, v0.s[1] // ik[7-4][1] + add x1, x1, 0x10 + fmla v28.4s, v20.4s, v0.s[2] // ik[3-0][2] + prfm pldl1keep, [x2, 0x380] + fmla v29.4s, v21.4s, v0.s[2] // ik[7-4][2] + fmla v30.4s, v22.4s, v0.s[3] // ik[3-0][3] + add x2, x2, 0x80 + fmla v31.4s, v23.4s, v0.s[3] // ik[7-4][3] + + b.ne loop4 + + and x3, x3, 0x3 + fadd v30.4s, v30.4s, v28.4s + fadd v31.4s, v31.4s, v29.4s + cbz x3, save_result + +loop1: + ldr s0, [x1], 0x4 + ldp q16, q17, [x2], 0x20 + subs x3, x3, 0x1 + + fmla v30.4s, v16.4s, v0.s[0] + fmla v31.4s, v17.4s, v0.s[0] + + b.ne loop1 + +save_result: + stp q30, q31, [x4] + + ret + + .space 256 + .end + diff --git a/executor/operator/arm64/fc/sgemv_1x8_a72.S b/executor/operator/arm64/fc/sgemv_1x8_a72.S index aa5665cdc..6129d735e 100644 --- a/executor/operator/arm64/fc/sgemv_1x8_a72.S +++ b/executor/operator/arm64/fc/sgemv_1x8_a72.S @@ -1,132 +1,132 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: xiaowei@openailab.com - */ - -// -// 1*8 single precise floating point matric multiplication -// -// -- -- -// | k0 k1 .. k7 | -// | . . . . | -// -- -- | . . . . 
| -- -- -- -- -// | i0 - - - - - - | x | . . . . | + | b0 b1 .. b7 | = | i0k0 i0k1 .. i0k7 | -// -- -- | . . . . | -- -- -- -- -// | . . . . | -// | . . . . | -// -- -- -// input 1 x p kernel p x 8 biases 1 x 8 output 1 x 8 p = kernel size -// -// -// optimised for Cortex-A72 pipeline 13 cycle per loop (1*8*4 dot product) -// the bottleneck is memory bandwidth -// -// input: -// x0 arg0 biases start address {b0, b1, b2, b3, b4, b5, b6, b7}} -// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} -// x2 arg2 kernel data start address {k00, k10, k20, k30, k40, k50, k60, k70, k80, k01, k11, k21, k31, ...} -// x3 arg3 kernel size -// x4 arg4 output data save address {ik0, ik1, ik2, ik3, ik4, ik5, ik6, ik7, ik8} -// -// output: no -// -// v0 4S data of input {i3 i2 i1 i0 } -// v1~v7 not used -// v16 4S kernal data0 {k30 | k20 | k10 | k00} -// v17 4S kernal data4 {k70 | k60 | k50 | k40} -// v18 4S kernal data1 {k31 | k21 | k11 | k01} -// v19 4S kernal data5 {k71 | k61 | k51 | k41} -// v20 4S kernal data2 {k32 | k22 | k12 | k02} -// v21 4S kernal data6 {k72 | k62 | k52 | k42} -// v22 4S kernal data3 {k33 | k23 | k13 | k03} -// v23 4S kernal data7 {k73 | k63 | k53 | k43} -// v24-v29 not used -// v30 dot product for {ik3, ik2, ik1, ik0} -// v31 dot product for {ik7, ik6, ik5, ik4} - - .section .text,"ax" - .align 5 - - .type sgemv_1x8_a72 STT_FUNC - .global sgemv_1x8_a72 -sgemv_1x8_a72: -// initial - cmp x3, 0x4 - prfm pldl1keep, [x1, 0x40] - prfm pldl1keep, [x2, 0x200] - prfm pldl1keep, [x2, 0x240] - movi d30, 0 - movi d31, 0 - cbz x0, start_convolution - ldp q30, q31, [x0] - -start_convolution: - and x5, x3, 0x3 - b.lt loop1 - lsr x6, x3, 0x2 - - -// main loop each loop generate dot prodcut for 1x8x4SFP -loop4: - ldr q0, [x1] // q0 = i[3-0] - ldp q16, q17, [x2] // q16 = k[3-0][0] q17 = k[7-4][0] - ldp q18, q19, [x2, 0x20] // q18 = k[3-0][1] q19 = k[7-4][1] - prfm pldl1keep, [x1, 0x80] - add x1, x1, 0x10 - subs x6, x6, 0x1 - - fmla v30.4s, 
v16.4s, v0.s[0] // ik[3-0][0] - fmla v31.4s, v17.4s, v0.s[0] // ik[7-4][0] - ldp q20, q21, [x2, 0x40] // q20 = k[3-0][2] q21 = k[7-4][2] - fmla v30.4s, v18.4s, v0.s[1] // ik[3-0][1] - prfm pldl1keep, [x2, 0x400] - fmla v31.4s, v19.4s, v0.s[1] // ik[7-4][1] - ldp q22, q23, [x2, 0x60] // q22 = k[3-0][3] q23 = k[7-4][3] - fmla v30.4s, v20.4s, v0.s[2] // ik[3-0][2] - prfm pldl1keep, [x2, 0x440] - add x2, x2, 0x80 - fmla v31.4s, v21.4s, v0.s[2] // ik[7-4][2] - fmla v30.4s, v22.4s, v0.s[3] // ik[3-0][3] - fmla v31.4s, v23.4s, v0.s[3] // ik[7-4][3] - - b.ne loop4 - - cbz x5, save_result - -loop1: - ldr s0, [x1], 0x4 - ldp q16, q17, [x2], 0x20 - subs x5, x5, 0x1 - - fmla v30.4s, v16.4s, v0.s[0] - fmla v31.4s, v17.4s, v0.s[0] - - b.ne loop1 - -save_result: - stp q30, q31, [x4] - - ret - - .space 256 - .end - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: xiaowei@openailab.com + */ + +// +// 1*8 single precise floating point matric multiplication +// +// -- -- +// | k0 k1 .. k7 | +// | . . . . | +// -- -- | . . . . | -- -- -- -- +// | i0 - - - - - - | x | . . . . | + | b0 b1 .. b7 | = | i0k0 i0k1 .. i0k7 | +// -- -- | . . . . | -- -- -- -- +// | . . . . | +// | . . . 
. | +// -- -- +// input 1 x p kernel p x 8 biases 1 x 8 output 1 x 8 p = kernel size +// +// +// optimised for Cortex-A72 pipeline 13 cycle per loop (1*8*4 dot product) +// the bottleneck is memory bandwidth +// +// input: +// x0 arg0 biases start address {b0, b1, b2, b3, b4, b5, b6, b7}} +// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} +// x2 arg2 kernel data start address {k00, k10, k20, k30, k40, k50, k60, k70, k80, k01, k11, k21, k31, ...} +// x3 arg3 kernel size +// x4 arg4 output data save address {ik0, ik1, ik2, ik3, ik4, ik5, ik6, ik7, ik8} +// +// output: no +// +// v0 4S data of input {i3 i2 i1 i0 } +// v1~v7 not used +// v16 4S kernal data0 {k30 | k20 | k10 | k00} +// v17 4S kernal data4 {k70 | k60 | k50 | k40} +// v18 4S kernal data1 {k31 | k21 | k11 | k01} +// v19 4S kernal data5 {k71 | k61 | k51 | k41} +// v20 4S kernal data2 {k32 | k22 | k12 | k02} +// v21 4S kernal data6 {k72 | k62 | k52 | k42} +// v22 4S kernal data3 {k33 | k23 | k13 | k03} +// v23 4S kernal data7 {k73 | k63 | k53 | k43} +// v24-v29 not used +// v30 dot product for {ik3, ik2, ik1, ik0} +// v31 dot product for {ik7, ik6, ik5, ik4} + + .section .text,"ax" + .align 5 + + .type sgemv_1x8_a72 STT_FUNC + .global sgemv_1x8_a72 +sgemv_1x8_a72: +// initial + cmp x3, 0x4 + prfm pldl1keep, [x1, 0x40] + prfm pldl1keep, [x2, 0x200] + prfm pldl1keep, [x2, 0x240] + movi d30, 0 + movi d31, 0 + cbz x0, start_convolution + ldp q30, q31, [x0] + +start_convolution: + and x5, x3, 0x3 + b.lt loop1 + lsr x6, x3, 0x2 + + +// main loop each loop generate dot prodcut for 1x8x4SFP +loop4: + ldr q0, [x1] // q0 = i[3-0] + ldp q16, q17, [x2] // q16 = k[3-0][0] q17 = k[7-4][0] + ldp q18, q19, [x2, 0x20] // q18 = k[3-0][1] q19 = k[7-4][1] + prfm pldl1keep, [x1, 0x80] + add x1, x1, 0x10 + subs x6, x6, 0x1 + + fmla v30.4s, v16.4s, v0.s[0] // ik[3-0][0] + fmla v31.4s, v17.4s, v0.s[0] // ik[7-4][0] + ldp q20, q21, [x2, 0x40] // q20 = k[3-0][2] q21 = k[7-4][2] + fmla v30.4s, 
v18.4s, v0.s[1] // ik[3-0][1] + prfm pldl1keep, [x2, 0x400] + fmla v31.4s, v19.4s, v0.s[1] // ik[7-4][1] + ldp q22, q23, [x2, 0x60] // q22 = k[3-0][3] q23 = k[7-4][3] + fmla v30.4s, v20.4s, v0.s[2] // ik[3-0][2] + prfm pldl1keep, [x2, 0x440] + add x2, x2, 0x80 + fmla v31.4s, v21.4s, v0.s[2] // ik[7-4][2] + fmla v30.4s, v22.4s, v0.s[3] // ik[3-0][3] + fmla v31.4s, v23.4s, v0.s[3] // ik[7-4][3] + + b.ne loop4 + + cbz x5, save_result + +loop1: + ldr s0, [x1], 0x4 + ldp q16, q17, [x2], 0x20 + subs x5, x5, 0x1 + + fmla v30.4s, v16.4s, v0.s[0] + fmla v31.4s, v17.4s, v0.s[0] + + b.ne loop1 + +save_result: + stp q30, q31, [x4] + + ret + + .space 256 + .end + diff --git a/executor/operator/arm64/fused/Makefile b/executor/operator/arm64/fused/Makefile index d250d1466..f9a189f58 100644 --- a/executor/operator/arm64/fused/Makefile +++ b/executor/operator/arm64/fused/Makefile @@ -1,3 +1,3 @@ -obj-y+=fused_bn_scale_relu.o -obj-y+=bn_scale_relu_neon.o - +obj-y+=fused_bn_scale_relu.o +obj-y+=bn_scale_relu_neon.o + diff --git a/executor/operator/arm64/fused/bn_scale_relu_neon.S b/executor/operator/arm64/fused/bn_scale_relu_neon.S index 095c0dc5f..316b00351 100644 --- a/executor/operator/arm64/fused/bn_scale_relu_neon.S +++ b/executor/operator/arm64/fused/bn_scale_relu_neon.S @@ -1,332 +1,332 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haitao@openailab.com - */ -/* relu implementation using neon vector */ - - -.text -.align 5 -.global bn_scale_relu_neon -.type bn_scale_relu_neon, %function - -bn_scale_relu_neon: - //x0 input - //x1 gamma - //x2 beta - //x3 mean - //x4 var - //x5 channel_number - //x6 channel_size - //x7 output - - //s28 -- gamma - //s29 -- beta - //s30 -- mean - //s31 -- var - //v27 --- zero - - /* - data=data*s_var+s_mean; - data=data*s_gamma+s_beta; - */ - - fmov s27,wzr - dup v27.4s,v27.s[0] - -channel_start: - - ldr s28,[x1],#4 - ld1r {v29.4s},[x2],#4 - ld1r {v30.4s},[x3],#4 - ldr s31,[x4],#4 - - lsr x9,x6,6 - lsl x10,x9,6 - sub x10,x6,x10 - cbz x9, less_64 - - ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 - ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 - - sub x9,x9,1 - cbz x9, last_block_64 - -block_64_start: - - - mov v8.16b,v30.16b - mov v9.16b,v30.16b - mov v10.16b,v30.16b - mov v11.16b,v30.16b - - ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 - - fmla v8.4s,v0.4s,v31.s[0] - fmla v9.4s,v1.4s,v31.s[0] - fmla v10.4s,v2.4s,v31.s[0] - fmla v11.4s,v3.4s,v31.s[0] - - - fmul v8.4s,v8.4s,v28.s[0] - fmul v9.4s,v9.4s,v28.s[0] - fmul v10.4s,v10.4s,v28.s[0] - fmul v11.4s,v11.4s,v28.s[0] - - fadd v8.4s,v8.4s,v29.4s - fadd v9.4s,v9.4s,v29.4s - fadd v10.4s,v10.4s,v29.4s - fadd v11.4s,v11.4s,v29.4s - - fmax v8.4s,v8.4s,v27.4s - fmax v9.4s,v9.4s,v27.4s - fmax v10.4s,v10.4s,v27.4s - fmax v11.4s,v11.4s,v27.4s - - mov v12.16b,v30.16b - mov v13.16b,v30.16b - - st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 - - mov v14.16b,v30.16b - mov 
v15.16b,v30.16b - - ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0],#64 - - fmla v12.4s,v4.4s,v31.s[0] - fmla v13.4s,v5.4s,v31.s[0] - fmla v14.4s,v6.4s,v31.s[0] - fmla v15.4s,v7.4s,v31.s[0] - - fmul v12.4s,v12.4s,v28.s[0] - fmul v13.4s,v13.4s,v28.s[0] - fmul v14.4s,v14.4s,v28.s[0] - fmul v15.4s,v15.4s,v28.s[0] - - fadd v12.4s,v12.4s,v29.4s - fadd v13.4s,v13.4s,v29.4s - fadd v14.4s,v14.4s,v29.4s - fadd v15.4s,v15.4s,v29.4s - - fmax v12.4s,v12.4s,v27.4s - fmax v13.4s,v13.4s,v27.4s - fmax v14.4s,v14.4s,v27.4s - fmax v15.4s,v15.4s,v27.4s - - mov v8.16b,v30.16b - mov v9.16b,v30.16b - - st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 - - mov v10.16b,v30.16b - mov v11.16b,v30.16b - subs x9,x9,1 - - ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 - - fmla v8.4s,v16.4s,v31.s[0] - fmla v9.4s,v17.4s,v31.s[0] - fmla v10.4s,v18.4s,v31.s[0] - fmla v11.4s,v19.4s,v31.s[0] - - fmul v8.4s,v8.4s,v28.s[0] - fmul v9.4s,v9.4s,v28.s[0] - fmul v10.4s,v10.4s,v28.s[0] - fmul v11.4s,v11.4s,v28.s[0] - - fadd v8.4s,v8.4s,v29.4s - fadd v9.4s,v9.4s,v29.4s - fadd v10.4s,v10.4s,v29.4s - fadd v11.4s,v11.4s,v29.4s - - fmax v8.4s,v8.4s,v27.4s - fmax v9.4s,v9.4s,v27.4s - fmax v10.4s,v10.4s,v27.4s - fmax v11.4s,v11.4s,v27.4s - - - mov v12.16b,v30.16b - mov v13.16b,v30.16b - - st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 - - mov v14.16b,v30.16b - mov v15.16b,v30.16b - - ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 - - fmla v12.4s,v20.4s,v31.s[0] - fmla v13.4s,v21.4s,v31.s[0] - fmla v14.4s,v22.4s,v31.s[0] - fmla v15.4s,v23.4s,v31.s[0] - - fmul v12.4s,v12.4s,v28.s[0] - fmul v13.4s,v13.4s,v28.s[0] - fmul v14.4s,v14.4s,v28.s[0] - fmul v15.4s,v15.4s,v28.s[0] - - fadd v12.4s,v12.4s,v29.4s - fadd v13.4s,v13.4s,v29.4s - fadd v14.4s,v14.4s,v29.4s - fadd v15.4s,v15.4s,v29.4s - - fmax v12.4s,v12.4s,v27.4s - fmax v13.4s,v13.4s,v27.4s - fmax v14.4s,v14.4s,v27.4s - fmax v15.4s,v15.4s,v27.4s - - st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 - - b.ne block_64_start - -last_block_64: - - ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 - - mov v8.16b,v30.16b - 
mov v9.16b,v30.16b - mov v10.16b,v30.16b - mov v11.16b,v30.16b - - fmla v8.4s,v0.4s,v31.s[0] - fmla v9.4s,v1.4s,v31.s[0] - fmla v10.4s,v2.4s,v31.s[0] - fmla v11.4s,v3.4s,v31.s[0] - - fmul v8.4s,v8.4s,v28.s[0] - fmul v9.4s,v9.4s,v28.s[0] - fmul v10.4s,v10.4s,v28.s[0] - fmul v11.4s,v11.4s,v28.s[0] - - fadd v8.4s,v8.4s,v29.4s - fadd v9.4s,v9.4s,v29.4s - fadd v10.4s,v10.4s,v29.4s - fadd v11.4s,v11.4s,v29.4s - - fmax v8.4s,v8.4s,v27.4s - fmax v9.4s,v9.4s,v27.4s - fmax v10.4s,v10.4s,v27.4s - fmax v11.4s,v11.4s,v27.4s - - st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 - - ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0],#64 - - mov v12.16b,v30.16b - mov v13.16b,v30.16b - mov v14.16b,v30.16b - mov v15.16b,v30.16b - - fmla v12.4s,v4.4s,v31.s[0] - fmla v13.4s,v5.4s,v31.s[0] - fmla v14.4s,v6.4s,v31.s[0] - fmla v15.4s,v7.4s,v31.s[0] - - fmul v12.4s,v12.4s,v28.s[0] - fmul v13.4s,v13.4s,v28.s[0] - fmul v14.4s,v14.4s,v28.s[0] - fmul v15.4s,v15.4s,v28.s[0] - - fadd v12.4s,v12.4s,v29.4s - fadd v13.4s,v13.4s,v29.4s - fadd v14.4s,v14.4s,v29.4s - fadd v15.4s,v15.4s,v29.4s - - fmax v12.4s,v12.4s,v27.4s - fmax v13.4s,v13.4s,v27.4s - fmax v14.4s,v14.4s,v27.4s - fmax v15.4s,v15.4s,v27.4s - - - st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 - - mov v8.16b,v30.16b - mov v9.16b,v30.16b - mov v10.16b,v30.16b - mov v11.16b,v30.16b - - fmla v8.4s,v16.4s,v31.s[0] - fmla v9.4s,v17.4s,v31.s[0] - fmla v10.4s,v18.4s,v31.s[0] - fmla v11.4s,v19.4s,v31.s[0] - - fmul v8.4s,v8.4s,v28.s[0] - fmul v9.4s,v9.4s,v28.s[0] - fmul v10.4s,v10.4s,v28.s[0] - fmul v11.4s,v11.4s,v28.s[0] - - fadd v8.4s,v8.4s,v29.4s - fadd v9.4s,v9.4s,v29.4s - fadd v10.4s,v10.4s,v29.4s - fadd v11.4s,v11.4s,v29.4s - - fmax v8.4s,v8.4s,v27.4s - fmax v9.4s,v9.4s,v27.4s - fmax v10.4s,v10.4s,v27.4s - fmax v11.4s,v11.4s,v27.4s - - mov v12.16b,v30.16b - mov v13.16b,v30.16b - mov v14.16b,v30.16b - mov v15.16b,v30.16b - - st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 - - fmla v12.4s,v20.4s,v31.s[0] - fmla v13.4s,v21.4s,v31.s[0] - fmla v14.4s,v22.4s,v31.s[0] - fmla 
v15.4s,v23.4s,v31.s[0] - - fmul v12.4s,v12.4s,v28.s[0] - fmul v13.4s,v13.4s,v28.s[0] - fmul v14.4s,v14.4s,v28.s[0] - fmul v15.4s,v15.4s,v28.s[0] - - fadd v12.4s,v12.4s,v29.4s - fadd v13.4s,v13.4s,v29.4s - fadd v14.4s,v14.4s,v29.4s - fadd v15.4s,v15.4s,v29.4s - - fmax v12.4s,v12.4s,v27.4s - fmax v13.4s,v13.4s,v27.4s - fmax v14.4s,v14.4s,v27.4s - fmax v15.4s,v15.4s,v27.4s - - st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 - - cbz x10, channel_done - -less_64: - subs x10,x10,1 - ldr s0,[x0],#4 - fmadd s1,s0,s31,s30 - fmadd s1,s1,s28,s29 - fmax s1,s1,s27 - str s1,[x7],#4 - b.ne less_64 - -channel_done: - - subs x5,x5,1 //channel_counter - b.ne channel_start - - ret +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ +/* relu implementation using neon vector */ + + +.text +.align 5 +.global bn_scale_relu_neon +.type bn_scale_relu_neon, %function + +bn_scale_relu_neon: + //x0 input + //x1 gamma + //x2 beta + //x3 mean + //x4 var + //x5 channel_number + //x6 channel_size + //x7 output + + //s28 -- gamma + //s29 -- beta + //s30 -- mean + //s31 -- var + //v27 --- zero + + /* + data=data*s_var+s_mean; + data=data*s_gamma+s_beta; + */ + + fmov s27,wzr + dup v27.4s,v27.s[0] + +channel_start: + + ldr s28,[x1],#4 + ld1r {v29.4s},[x2],#4 + ld1r {v30.4s},[x3],#4 + ldr s31,[x4],#4 + + lsr x9,x6,6 + lsl x10,x9,6 + sub x10,x6,x10 + cbz x9, less_64 + + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + + sub x9,x9,1 + cbz x9, last_block_64 + +block_64_start: + + + mov v8.16b,v30.16b + mov v9.16b,v30.16b + mov v10.16b,v30.16b + mov v11.16b,v30.16b + + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 + + fmla v8.4s,v0.4s,v31.s[0] + fmla v9.4s,v1.4s,v31.s[0] + fmla v10.4s,v2.4s,v31.s[0] + fmla v11.4s,v3.4s,v31.s[0] + + + fmul v8.4s,v8.4s,v28.s[0] + fmul v9.4s,v9.4s,v28.s[0] + fmul v10.4s,v10.4s,v28.s[0] + fmul v11.4s,v11.4s,v28.s[0] + + fadd v8.4s,v8.4s,v29.4s + fadd v9.4s,v9.4s,v29.4s + fadd v10.4s,v10.4s,v29.4s + fadd v11.4s,v11.4s,v29.4s + + fmax v8.4s,v8.4s,v27.4s + fmax v9.4s,v9.4s,v27.4s + fmax v10.4s,v10.4s,v27.4s + fmax v11.4s,v11.4s,v27.4s + + mov v12.16b,v30.16b + mov v13.16b,v30.16b + + st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 + + mov v14.16b,v30.16b + mov v15.16b,v30.16b + + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0],#64 + + fmla v12.4s,v4.4s,v31.s[0] + fmla v13.4s,v5.4s,v31.s[0] + fmla v14.4s,v6.4s,v31.s[0] + fmla v15.4s,v7.4s,v31.s[0] + + fmul v12.4s,v12.4s,v28.s[0] + fmul v13.4s,v13.4s,v28.s[0] + fmul v14.4s,v14.4s,v28.s[0] + fmul v15.4s,v15.4s,v28.s[0] + + fadd v12.4s,v12.4s,v29.4s + fadd v13.4s,v13.4s,v29.4s + fadd v14.4s,v14.4s,v29.4s + fadd v15.4s,v15.4s,v29.4s + + fmax 
v12.4s,v12.4s,v27.4s + fmax v13.4s,v13.4s,v27.4s + fmax v14.4s,v14.4s,v27.4s + fmax v15.4s,v15.4s,v27.4s + + mov v8.16b,v30.16b + mov v9.16b,v30.16b + + st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 + + mov v10.16b,v30.16b + mov v11.16b,v30.16b + subs x9,x9,1 + + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 + + fmla v8.4s,v16.4s,v31.s[0] + fmla v9.4s,v17.4s,v31.s[0] + fmla v10.4s,v18.4s,v31.s[0] + fmla v11.4s,v19.4s,v31.s[0] + + fmul v8.4s,v8.4s,v28.s[0] + fmul v9.4s,v9.4s,v28.s[0] + fmul v10.4s,v10.4s,v28.s[0] + fmul v11.4s,v11.4s,v28.s[0] + + fadd v8.4s,v8.4s,v29.4s + fadd v9.4s,v9.4s,v29.4s + fadd v10.4s,v10.4s,v29.4s + fadd v11.4s,v11.4s,v29.4s + + fmax v8.4s,v8.4s,v27.4s + fmax v9.4s,v9.4s,v27.4s + fmax v10.4s,v10.4s,v27.4s + fmax v11.4s,v11.4s,v27.4s + + + mov v12.16b,v30.16b + mov v13.16b,v30.16b + + st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 + + mov v14.16b,v30.16b + mov v15.16b,v30.16b + + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + + fmla v12.4s,v20.4s,v31.s[0] + fmla v13.4s,v21.4s,v31.s[0] + fmla v14.4s,v22.4s,v31.s[0] + fmla v15.4s,v23.4s,v31.s[0] + + fmul v12.4s,v12.4s,v28.s[0] + fmul v13.4s,v13.4s,v28.s[0] + fmul v14.4s,v14.4s,v28.s[0] + fmul v15.4s,v15.4s,v28.s[0] + + fadd v12.4s,v12.4s,v29.4s + fadd v13.4s,v13.4s,v29.4s + fadd v14.4s,v14.4s,v29.4s + fadd v15.4s,v15.4s,v29.4s + + fmax v12.4s,v12.4s,v27.4s + fmax v13.4s,v13.4s,v27.4s + fmax v14.4s,v14.4s,v27.4s + fmax v15.4s,v15.4s,v27.4s + + st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 + + b.ne block_64_start + +last_block_64: + + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 + + mov v8.16b,v30.16b + mov v9.16b,v30.16b + mov v10.16b,v30.16b + mov v11.16b,v30.16b + + fmla v8.4s,v0.4s,v31.s[0] + fmla v9.4s,v1.4s,v31.s[0] + fmla v10.4s,v2.4s,v31.s[0] + fmla v11.4s,v3.4s,v31.s[0] + + fmul v8.4s,v8.4s,v28.s[0] + fmul v9.4s,v9.4s,v28.s[0] + fmul v10.4s,v10.4s,v28.s[0] + fmul v11.4s,v11.4s,v28.s[0] + + fadd v8.4s,v8.4s,v29.4s + fadd v9.4s,v9.4s,v29.4s + fadd v10.4s,v10.4s,v29.4s + fadd v11.4s,v11.4s,v29.4s + + fmax 
v8.4s,v8.4s,v27.4s + fmax v9.4s,v9.4s,v27.4s + fmax v10.4s,v10.4s,v27.4s + fmax v11.4s,v11.4s,v27.4s + + st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 + + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0],#64 + + mov v12.16b,v30.16b + mov v13.16b,v30.16b + mov v14.16b,v30.16b + mov v15.16b,v30.16b + + fmla v12.4s,v4.4s,v31.s[0] + fmla v13.4s,v5.4s,v31.s[0] + fmla v14.4s,v6.4s,v31.s[0] + fmla v15.4s,v7.4s,v31.s[0] + + fmul v12.4s,v12.4s,v28.s[0] + fmul v13.4s,v13.4s,v28.s[0] + fmul v14.4s,v14.4s,v28.s[0] + fmul v15.4s,v15.4s,v28.s[0] + + fadd v12.4s,v12.4s,v29.4s + fadd v13.4s,v13.4s,v29.4s + fadd v14.4s,v14.4s,v29.4s + fadd v15.4s,v15.4s,v29.4s + + fmax v12.4s,v12.4s,v27.4s + fmax v13.4s,v13.4s,v27.4s + fmax v14.4s,v14.4s,v27.4s + fmax v15.4s,v15.4s,v27.4s + + + st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 + + mov v8.16b,v30.16b + mov v9.16b,v30.16b + mov v10.16b,v30.16b + mov v11.16b,v30.16b + + fmla v8.4s,v16.4s,v31.s[0] + fmla v9.4s,v17.4s,v31.s[0] + fmla v10.4s,v18.4s,v31.s[0] + fmla v11.4s,v19.4s,v31.s[0] + + fmul v8.4s,v8.4s,v28.s[0] + fmul v9.4s,v9.4s,v28.s[0] + fmul v10.4s,v10.4s,v28.s[0] + fmul v11.4s,v11.4s,v28.s[0] + + fadd v8.4s,v8.4s,v29.4s + fadd v9.4s,v9.4s,v29.4s + fadd v10.4s,v10.4s,v29.4s + fadd v11.4s,v11.4s,v29.4s + + fmax v8.4s,v8.4s,v27.4s + fmax v9.4s,v9.4s,v27.4s + fmax v10.4s,v10.4s,v27.4s + fmax v11.4s,v11.4s,v27.4s + + mov v12.16b,v30.16b + mov v13.16b,v30.16b + mov v14.16b,v30.16b + mov v15.16b,v30.16b + + st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 + + fmla v12.4s,v20.4s,v31.s[0] + fmla v13.4s,v21.4s,v31.s[0] + fmla v14.4s,v22.4s,v31.s[0] + fmla v15.4s,v23.4s,v31.s[0] + + fmul v12.4s,v12.4s,v28.s[0] + fmul v13.4s,v13.4s,v28.s[0] + fmul v14.4s,v14.4s,v28.s[0] + fmul v15.4s,v15.4s,v28.s[0] + + fadd v12.4s,v12.4s,v29.4s + fadd v13.4s,v13.4s,v29.4s + fadd v14.4s,v14.4s,v29.4s + fadd v15.4s,v15.4s,v29.4s + + fmax v12.4s,v12.4s,v27.4s + fmax v13.4s,v13.4s,v27.4s + fmax v14.4s,v14.4s,v27.4s + fmax v15.4s,v15.4s,v27.4s + + st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 
+ + cbz x10, channel_done + +less_64: + subs x10,x10,1 + ldr s0,[x0],#4 + fmadd s1,s0,s31,s30 + fmadd s1,s1,s28,s29 + fmax s1,s1,s27 + str s1,[x7],#4 + b.ne less_64 + +channel_done: + + subs x5,x5,1 //channel_counter + b.ne channel_start + + ret diff --git a/executor/operator/arm64/fused/fused_bn_scale_relu.cpp b/executor/operator/arm64/fused/fused_bn_scale_relu.cpp index 4dfb8753a..0caab72ff 100644 --- a/executor/operator/arm64/fused/fused_bn_scale_relu.cpp +++ b/executor/operator/arm64/fused/fused_bn_scale_relu.cpp @@ -1,207 +1,207 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haitao@openailab.com - */ -#include -#include -#include - -#include "logger.hpp" -#include "operator/fused_operator.hpp" -#include "node_ops.hpp" -#include "tensor_mem.hpp" -#include "graph.hpp" - -extern "C" void bn_scale_relu_neon(const float* input, float* gamma, float* beta, float* mean, float* var, - int channel_number, int channel_size, float* output); - -namespace TEngine { - -namespace FusedBNScaleReluArm64 { - -struct FusedOps : public MTNodeOps -{ - struct BNParam - { - const float* input; - float* gamma; - float* beta; - float* mean; - float* var; - int channel_num; - int channel_size; - float* output; - }; - - bool Aider(int cpu, int seq, void* data) - { - BNParam* param = ( BNParam* )(data); - - bn_scale_relu_neon(param->input, param->gamma, param->beta, param->mean, param->var, param->channel_num, - param->channel_size, param->output); - - return true; - } - - bool OnBind(Node* node) - { - inplace_t io_map; - - io_map[0] = 0; - - node->SetAttr(ATTR_INPLACE, io_map); - - return true; - } - - bool Run(Node* node) - { - const Tensor* input_tensor = node->GetInputTensor(0); - Tensor* output_tensor = node->GetOutputTensor(0); - - const TShape& shape = input_tensor->GetShape(); - - const std::vector dims = shape.GetDim(); - - int batch_number = dims[0]; - int channel_num = dims[1]; - int channel_size = dims[2] * dims[3]; - - Tensor* gamma_tensor = node->GetInputTensor(1); - Tensor* beta_tensor = node->GetInputTensor(2); - Tensor* mean_tensor = node->GetInputTensor(3); - Tensor* var_tensor = node->GetInputTensor(4); - - float* gamma = ( float* )get_tensor_mem(gamma_tensor); - float* beta = ( float* )get_tensor_mem(beta_tensor); - float* mean = ( float* )get_tensor_mem(mean_tensor); - float* var = ( float* )get_tensor_mem(var_tensor); - - const float* input = ( const float* )get_tensor_mem(input_tensor); - float* output = ( float* )get_tensor_mem(output_tensor); - - int cpu_number = 
cpu_info->GetCPUNumber(); - - for(int i = 0; i < batch_number; i++) - { - if(cpu_number == 1) - { - bn_scale_relu_neon(input, gamma, beta, mean, var, channel_num, channel_size, output); - input += channel_size * channel_num; - output += channel_size * channel_num; - } - else - { - std::vector task_list; - std::vector param_list; - - auto f = std::bind(&FusedOps::Aider, this, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - - int step = (channel_num + (cpu_number - 1)) / cpu_number; - - if(channel_num - (cpu_number - 1) * step <= 0) - step = channel_num / cpu_number; - - task_list.resize(cpu_number); - param_list.resize(cpu_number); - - for(int i = 0; i < cpu_number; i++) - { - BNParam* param = ¶m_list[i]; - sub_op_task* task = &task_list[i]; - - task->exec_func = f; - task->seq = i; - task->data = param; - - param->input = input; - param->gamma = gamma; - param->beta = beta; - param->mean = mean; - param->var = var; - - param->channel_num = step; - param->channel_size = channel_size; - param->output = output; - - input += channel_size * step; - output += channel_size * step; - - gamma += step; - beta += step; - mean += step; - var += step; - } - - param_list[cpu_number - 1].channel_num = channel_num - (cpu_number - 1) * step; - - task_dispatch(task_list, -1); - wait_done(); - } - - /* - the c code of assembly code - - for(int c=0;c +#include +#include + +#include "logger.hpp" +#include "operator/fused_operator.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "graph.hpp" + +extern "C" void bn_scale_relu_neon(const float* input, float* gamma, float* beta, float* mean, float* var, + int channel_number, int channel_size, float* output); + +namespace TEngine { + +namespace FusedBNScaleReluArm64 { + +struct FusedOps : public MTNodeOps +{ + struct BNParam + { + const float* input; + float* gamma; + float* beta; + float* mean; + float* var; + int channel_num; + int channel_size; + float* output; + }; + + bool Aider(int cpu, int 
seq, void* data) + { + BNParam* param = ( BNParam* )(data); + + bn_scale_relu_neon(param->input, param->gamma, param->beta, param->mean, param->var, param->channel_num, + param->channel_size, param->output); + + return true; + } + + bool OnBind(Node* node) + { + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + + return true; + } + + bool Run(Node* node) + { + const Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + + const TShape& shape = input_tensor->GetShape(); + + const std::vector dims = shape.GetDim(); + + int batch_number = dims[0]; + int channel_num = dims[1]; + int channel_size = dims[2] * dims[3]; + + Tensor* gamma_tensor = node->GetInputTensor(1); + Tensor* beta_tensor = node->GetInputTensor(2); + Tensor* mean_tensor = node->GetInputTensor(3); + Tensor* var_tensor = node->GetInputTensor(4); + + float* gamma = ( float* )get_tensor_mem(gamma_tensor); + float* beta = ( float* )get_tensor_mem(beta_tensor); + float* mean = ( float* )get_tensor_mem(mean_tensor); + float* var = ( float* )get_tensor_mem(var_tensor); + + const float* input = ( const float* )get_tensor_mem(input_tensor); + float* output = ( float* )get_tensor_mem(output_tensor); + + int cpu_number = cpu_info->GetCPUNumber(); + + for(int i = 0; i < batch_number; i++) + { + if(cpu_number == 1) + { + bn_scale_relu_neon(input, gamma, beta, mean, var, channel_num, channel_size, output); + input += channel_size * channel_num; + output += channel_size * channel_num; + } + else + { + std::vector task_list; + std::vector param_list; + + auto f = std::bind(&FusedOps::Aider, this, std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + + int step = (channel_num + (cpu_number - 1)) / cpu_number; + + if(channel_num - (cpu_number - 1) * step <= 0) + step = channel_num / cpu_number; + + task_list.resize(cpu_number); + param_list.resize(cpu_number); + + for(int i = 0; i < cpu_number; i++) + { + BNParam* param = 
¶m_list[i]; + sub_op_task* task = &task_list[i]; + + task->exec_func = f; + task->seq = i; + task->data = param; + + param->input = input; + param->gamma = gamma; + param->beta = beta; + param->mean = mean; + param->var = var; + + param->channel_num = step; + param->channel_size = channel_size; + param->output = output; + + input += channel_size * step; + output += channel_size * step; + + gamma += step; + beta += step; + mean += step; + var += step; + } + + param_list[cpu_number - 1].channel_num = channel_num - (cpu_number - 1) * step; + + task_dispatch(task_list, -1); + wait_done(); + } + + /* + the c code of assembly code + + for(int c=0;c /** -* MaxPool_2x2: pooling for ksize=2x2,stride=2, pad=0(default pad=0) -* @param[in] input input data (const float pointer) -* @param[in] output output data (float pointer) -* @param[in] inc input channel (int) -* @param[in] inh input height (int) -* @param[in] inw input width (int) -* @param[in] outh output height (int) -* @param[in] outw output width (int) -* @return None -*/ + * MaxPool_2x2: pooling for ksize=2x2,stride=2, pad=0(default pad=0) + * @param[in] input input data (const float pointer) + * @param[in] output output data (float pointer) + * @param[in] inc input channel (int) + * @param[in] inh input height (int) + * @param[in] inw input width (int) + * @param[in] outh output height (int) + * @param[in] outw output width (int) + * @return None + */ static void MaxPool_2x2s2(const float* input, float* output, int inc, int inh, int inw, int outh, int outw, int, int, int, int, int, int, int pad_h1, int pad_w1, int) diff --git a/executor/operator/arm64/init.cpp b/executor/operator/arm64/init.cpp index b8f52d7ad..5a2a51771 100644 --- a/executor/operator/arm64/init.cpp +++ b/executor/operator/arm64/init.cpp @@ -25,16 +25,14 @@ namespace TEngine { extern void RegisterConv2dFast(void); -extern void RegisterConv2dINT8(void); extern void RegisterConv2dDepth(void); extern void RegisterFullyConnectedFast(void); -extern void 
RegisterFullyConnectedINT8(void); extern void RegisterPoolingNodeExec(void); extern void RegisterBatchNormNodeExec(void); extern void RegisterScaleNodeExec(void); -extern void RegisterDeconvNodeExec(void); extern void RegisterLRNNodeExec(void); + void __attribute__((visibility("default"))) RegisterArmOps(void) { RegisterConv2dFast(); diff --git a/executor/operator/arm64/pooling.cpp b/executor/operator/arm64/pooling.cpp index 221b7d602..0cd1473c2 100644 --- a/executor/operator/arm64/pooling.cpp +++ b/executor/operator/arm64/pooling.cpp @@ -31,15 +31,21 @@ #include "tensor_mem.hpp" #include "pooling_kernel.h" +#ifdef CONFIG_AUTH_DEVICE +#include "auth_nodeops.hpp" +#endif + namespace TEngine { namespace PoolingImpl { +const int default_prio = 100; + typedef void (*pool_kernel_t)(const float* input, float* output, int inc, int in_h, int inw, int out_h, int out_w, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe); -struct PoolingOps : public NodeOps +struct PoolingOps : public MTNodeOps { PoolingSize pooling_size = POOL_GENERIC; pool_kernel_t kernel_run = nullptr; @@ -80,15 +86,15 @@ struct PoolingOps : public NodeOps Pooling* pooling_op = dynamic_cast(node->GetOp()); PoolParam* param_ = pooling_op->GetParam(); - if(param_->strides[0] == 2 && param_->strides[1] == 2) + if(param_->stride_h == 2 && param_->stride_w == 2) { - if(param_->kernel_shape[0] == 2 && param_->kernel_shape[1] == 2) + if(param_->kernel_h == 2 && param_->kernel_w == 2) pooling_size = POOL_K2S2; - else if(param_->kernel_shape[0] == 3 && param_->kernel_shape[1] == 3) + else if(param_->kernel_h == 3 && param_->kernel_w == 3) pooling_size = POOL_K3S2; } - else if(param_->strides[0] == 1 && param_->strides[1] == 1 && param_->kernel_shape[0] == 3 && - param_->kernel_shape[1] == 3) + else if(param_->stride_h == 1 && param_->stride_w == 1 && param_->kernel_h == 3 && + param_->kernel_w == 3) { pooling_size = POOL_K3S1; } @@ -102,14 +108,14 
@@ struct PoolingOps : public NodeOps } kernel_run = Generic_MaxPool; - if(param_->pads[0] == 0 && param_->pads[1] == 0) + if(param_->pad_h0 == 0 && param_->pad_w0 == 0) { if(pooling_size == POOL_K2S2) kernel_run = MaxPool_2x2s2; else if(pooling_size == POOL_K3S2) kernel_run = MaxPool_3x3s2; } - else if(param_->pads[0] == 1 && param_->pads[1] == 1) + else if(param_->pad_h0 == 1 && param_->pad_w0 == 1) { if(pooling_size == POOL_K2S2) kernel_run = MaxPool_2x2s2_pad1; @@ -131,7 +137,7 @@ struct PoolingOps : public NodeOps } kernel_run = Generic_AvgPool; - if(param_->pads[0] == 0 && param_->pads[1] == 0) + if(param_->pad_h0 == 0 && param_->pad_w0 == 0) { if(pooling_size == POOL_K2S2) kernel_run = AvgPool_2x2s2; @@ -139,7 +145,7 @@ struct PoolingOps : public NodeOps kernel_run = AvgPool_3x3s2; } - if(param_->pads[0] == 1 && param_->pads[1] == 1) + if(param_->pad_h0 == 1 && param_->pad_w0 == 1) { if(pooling_size == POOL_K2S2) kernel_run = AvgPool_2x2s2_pad1; @@ -180,8 +186,8 @@ struct PoolingOps : public NodeOps printf("input: %d,%d,%d --> output: %d,%d \n", in_dim[1], in_dim[2], in_dim[3], out_dim[2], out_dim[3]); printf("kernel: %d, stride: %d, arg: %d, pad: %d,%d,%d,%d\n", - param_->kernel_shape[0], param_->strides[0], param_->alg, - param_->pads[0],param_->pads[1],param_->pads[2],param_->pads[3]); + param_->kernel_h, param_->stride_h, param_->alg, + param_->pad_h0,param_->pad_w0,param_->pad_h1,param_->pad_w1); #endif int is_caffe = param_->caffe_flavor; for(int n = 0; n < in_dim[0]; n++) @@ -191,8 +197,8 @@ struct PoolingOps : public NodeOps if(!exec_attr->pooling_mt) { kernel_run(in_ptr, out_ptr, in_dim[1], in_dim[2], in_dim[3], out_dim[2], out_dim[3], - param_->kernel_shape[0], param_->kernel_shape[1], param_->strides[0], param_->strides[1], - param_->pads[0], param_->pads[1], param_->pads[2], param_->pads[3], is_caffe); + param_->kernel_h, param_->kernel_w, param_->stride_h, param_->stride_w, + param_->pad_h0, param_->pad_w0, param_->pad_h1, param_->pad_w1, 
is_caffe); } else { @@ -221,14 +227,14 @@ struct PoolingOps : public NodeOps param->in_w = in_dim[3]; param->out_h = out_dim[2]; param->out_w = out_dim[3]; - param->kernel_h = param_->kernel_shape[0]; - param->kernel_w = param_->kernel_shape[1]; - param->stride_h = param_->strides[0]; - param->stride_w = param_->strides[1]; - param->pad_h0 = param_->pads[0]; - param->pad_w0 = param_->pads[1]; - param->pad_h1 = param_->pads[2]; - param->pad_w1 = param_->pads[3]; + param->kernel_h = param_->kernel_h; + param->kernel_w = param_->kernel_w; + param->stride_h = param_->stride_h; + param->stride_w = param_->stride_w; + param->pad_h0 = param_->pad_h0; + param->pad_w0 = param_->pad_w0; + param->pad_h1 = param_->pad_h1; + param->pad_w1 = param_->pad_w1; param->is_caffe = is_caffe; } @@ -241,17 +247,21 @@ struct PoolingOps : public NodeOps } }; -const int default_prio = 100; NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) { +#ifdef CONFIG_AUTH_DEVICE + if(!get_auth_float_enabled()) + return nullptr; +#endif + + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); - if(exec_attr->layout == TENGINE_LAYOUT_NHWC) + if( data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) return nullptr; PoolingOps* ops = new PoolingOps(); - ops->need_free = true; - return ops; } diff --git a/executor/operator/arm64/scale_neon.S b/executor/operator/arm64/scale_neon.S index f67da6c15..a66f39827 100644 --- a/executor/operator/arm64/scale_neon.S +++ b/executor/operator/arm64/scale_neon.S @@ -27,6 +27,7 @@ .text .align 5 .global scale_neon +.hidden scale_neon .type scale_neon, %function scale_neon: @@ -133,6 +134,7 @@ channel_done: //scale_neon_bias .global scale_neon_bias +.hidden scale_neon_bias .type scale_neon_bias, %function scale_neon_bias: diff --git a/executor/operator/common/Makefile b/executor/operator/common/Makefile index 9ff4bad94..cb0ad1b2e 100644 --- 
a/executor/operator/common/Makefile +++ b/executor/operator/common/Makefile @@ -1,4 +1,3 @@ -obj-y+=conv_ref.o obj-y+=concat.o obj-y+=dropout.o obj-y+=softmax.o @@ -22,7 +21,6 @@ obj-y+=resize.o obj-y+=pooling.o obj-y+=batchnorm.o obj-y+=scale.o -obj-y+=custom_kernel_ops.o obj-y+=logistic.o obj-y+=detection_postprocess.o obj-y+=fused/ diff --git a/executor/operator/common/batchnorm.cpp b/executor/operator/common/batchnorm.cpp index d251471f7..30aa6662d 100644 --- a/executor/operator/common/batchnorm.cpp +++ b/executor/operator/common/batchnorm.cpp @@ -165,15 +165,28 @@ struct BatchNormOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + if((input->GetShape()).GetDim().size() != 4) + return nullptr; + + BatchNormOps* ops = new BatchNormOps(); + + return ops; +} + } // namespace BatchNormImpl using namespace BatchNormImpl; void RegisterBatchNorm_NodeExec(void) { - BatchNormOps* ops = new BatchNormOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "BatchNormalization", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "BatchNormalization", BatchNormImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/blas/Makefile b/executor/operator/common/blas/Makefile index 29f55108a..849d3f253 100644 --- a/executor/operator/common/blas/Makefile +++ b/executor/operator/common/blas/Makefile @@ -1,4 +1,6 @@ obj-y+=conv_2d_blas.o obj-y+=deconv_2d_blas.o obj-y+=fc_blas.o -obj-y+=lstm_blas.o \ No newline at end of file +obj-y+=lstm_blas.o +obj-y+=rnn_blas.o +obj-y+=gru_blas.o \ No newline at end of file diff --git a/executor/operator/common/blas/conv_2d_blas.cpp b/executor/operator/common/blas/conv_2d_blas.cpp index 
d64c2838e..b05b87360 100644 --- a/executor/operator/common/blas/conv_2d_blas.cpp +++ b/executor/operator/common/blas/conv_2d_blas.cpp @@ -37,8 +37,8 @@ namespace TEngine { namespace ConvolutionImpl { -const char* conv_name = "CONV_IMPL"; -const int default_prio = 1200; +const char* conv_name = "CONV_BLAS"; +const int default_prio = 5000; struct ConvolutionOps : public NodeOps { @@ -177,8 +177,8 @@ struct ConvolutionOps : public NodeOps int ksize_h = param->kernel_h; int ksize_w = param->kernel_w; - int pad_w = param->pads[1]; - int pad_h = param->pads[0]; + int pad_w = param->pad_w0; + int pad_h = param->pad_h0; int stride_w = param->stride_w; int stride_h = param->stride_h; @@ -248,14 +248,14 @@ struct ConvolutionOps : public NodeOps NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) { + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); - if(exec_attr->layout == TENGINE_LAYOUT_NHWC) + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) return nullptr; ConvolutionOps* ops = new ConvolutionOps(); - ops->need_free = true; - return ops; } @@ -263,8 +263,9 @@ NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) void RegisterConvBlasNodeExec(void) { - NodeOpsRegistryManager::RegisterOPImplementor("common", "Convolution", ConvolutionImpl::SelectFunc, - ConvolutionImpl::default_prio); + if(!NodeOpsRegistryManager::RegisterOPImplementor("common", "Convolution", ConvolutionImpl::SelectFunc, + ConvolutionImpl::default_prio)) + LOG_ERROR()<<__FUNCTION__<<" :Regist OP failed for prio ["<GetShape(); const std::vector dims = shape.GetDim(); - int size = dims[2] * dims[3] * param_->kernel_size * param_->kernel_size * param_->num_output; + int size = dims[2] * dims[3] * param_->kernel_h * param_->kernel_w * param_->num_output; float* buffer = ( float* )std::malloc(sizeof(float) * size); memset(buffer, 0, size * sizeof(float)); 
(*node)["buffer"] = buffer; @@ -154,10 +154,10 @@ struct DeconvBlasOps : public NodeOps // param Deconvolution* deconv_op = dynamic_cast(node->GetOp()); DeconvParam* param_ = deconv_op->GetParam(); - int pad = param_->pad; - int stride = param_->stride; - int ksize = param_->kernel_size; - int dilation = param_->dilation; + int pad = param_->pad_w0; + int stride = param_->stride_w; + int ksize = param_->kernel_w; + int dilation = param_->dilation_w; // buffer float* buffer = any_cast(node->GetAttr("buffer")); @@ -205,15 +205,48 @@ struct DeconvBlasOps : public NodeOps } }; -} // namespace DeconvolutionImpl + +static bool isDeconvSupported(DeconvParam * param) +{ + if(param->pad_h0 != param->pad_h1 || param->pad_w0 != param->pad_w1 || + param->pad_w0 != param->pad_h0 || + param->stride_h != param->stride_w || + param->dilation_h != param->dilation_w || + param->group != 1 || + param->kernel_h != param->kernel_w + ) + return false; + return true; + +} +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ +#ifdef CONFIG_ATUH_DEVICE + if(!get_auth_float_enabled()) + return nullptr; +#endif + Operator* op = node->GetOp(); + Deconvolution* deconv_op = dynamic_cast(op); + DeconvParam* param = deconv_op->GetParam(); + if(!isDeconvSupported(param)) + return nullptr; + + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + DeconvBlasOps* ops = new DeconvBlasOps(); + return ops; +} -using namespace DeconvolutionImpl; +} // namespace DeconvolutionBlasImpl + +using namespace DeconvolutionBlasImpl; void RegisterDeconvBlasNodeExec(void) { - DeconvBlasOps* ops = new DeconvBlasOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Deconvolution", ops); + 
NodeOpsRegistryManager::RegisterOPImplementor("common","Deconvolution",DeconvolutionBlasImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/blas/fc_blas.cpp b/executor/operator/common/blas/fc_blas.cpp index 965a7f862..ae089f424 100644 --- a/executor/operator/common/blas/fc_blas.cpp +++ b/executor/operator/common/blas/fc_blas.cpp @@ -129,14 +129,25 @@ struct FcBlasOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + FcBlasOps* ops = new FcBlasOps(); + + return ops; +} + } // namespace FCImpl using namespace FCImpl; void RegisterFcBlasNodeExec(void) { - FcBlasOps* ops = new FcBlasOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "FullyConnected", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "FullyConnected", FCImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/blas/gru_blas.cpp b/executor/operator/common/blas/gru_blas.cpp new file mode 100644 index 000000000..a7a690c7d --- /dev/null +++ b/executor/operator/common/blas/gru_blas.cpp @@ -0,0 +1,473 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#include +#include +#include +#include +#include + +#include "graph.hpp" +#include "logger.hpp" +#include "node_ops.hpp" +#include "operator/gru.hpp" +#include "tensor_mem.hpp" +#include "tengine_errno.hpp" +#include +#include + +namespace TEngine { + +namespace GRURefImpl { + +struct GRUOps : public NodeOps +{ + Tensor* init_h_tensor; + Tensor* kernel_tensor; + Tensor* bias_tensor; + Tensor* candidate_kernel_tensor; + Tensor* candidate_bias_tensor; + Tensor* fused_kernel_tensor; + // bool dynamic_shape; + void* init_h_data; + + GRUOps(void) + { + init_h_tensor = nullptr; + bias_tensor = nullptr; + init_h_data = nullptr; + kernel_tensor=nullptr; + candidate_kernel_tensor=nullptr; + candidate_bias_tensor=nullptr; + fused_kernel_tensor=nullptr; + } + + void sigmoid(float* data, int size) + { + for(int i = 0; i < size; i++) + { + data[i] = std::min(data[i], 30.0f); + data[i] = std::max(data[i], -30.0f); + + data[i] = 1 / (1 + exp(-data[i])); + } + } + /* + @ func_name: concat_axis_1 + @ param: + a:[m, n1] + b:[m, n2] + c:[m, n1 + n2] + */ + void concat_axis_1(const float* a, const float* b, float* c, int m, int n1, int n2) + { + int n = n1 + n2; + for(int i = 0; i < m; i++) + { + for(int j = 0; j < n1; j++) + { + c[j + i * n] = a[j + i * n1]; + } + for(int j = 0; j < n2; j++) + { + c[j + i * n + n1] = b[j + i * n2]; + } + } + } + + void slice_axis_1(float* a, float* c, int m, int n, int st, int ed) + { + for(int i = 0; i < m; i++) + { + for(int j = st; j < ed; j++) + { 
+ c[i * (ed - st) + j - st] = a[i * n + j]; + } + } + } + void do_gemm(const float* a, const float* b, float* c, int m, int k, int n, int lda, int ldb, int ldc) + { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, a, lda, b, ldb, 0.0, c, ldc); + } + void do_gemm_mx(const float* a, const float* b, float* c, int m, int k, int n, int lda, int ldb, int ldc) + { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0, a, lda, b, ldb, 0.0, c, ldc); + } + + bool do_GRU_step(const float* input, float* init_h, const float* kernel, const float* bias, + const float* candidate_kernel,const float* candidate_bias,int batch_size, + int input_size, int hidden_size,int mxnet_flag) + { + + if(mxnet_flag==1) + { + float* i2h_mat = ( float* )malloc(sizeof(float) * batch_size *3* hidden_size); + float* h2h_mat = ( float* )malloc(sizeof(float) * batch_size *3* hidden_size); + + float* i2h_r = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + float* i2h_z = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + float* i2h = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + + float* h2h_r = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + float* h2h_z = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + float* h2h = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + + float* r_g = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + float* u_g = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + float* next_h_tmp = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + + do_gemm_mx(input, kernel, i2h_mat, batch_size, input_size, 3*hidden_size, input_size, + input_size, 3*hidden_size); + + for(int i = 0; i < batch_size; i++) + { + for(int j = 0; j < (3*hidden_size); j++) + { + i2h_mat[i *(3*hidden_size) + j] += bias[j]; + } + } + + do_gemm_mx(init_h, candidate_kernel, h2h_mat, batch_size, hidden_size, 3*hidden_size, hidden_size, + hidden_size, 3*hidden_size); + + for(int i = 0; i < 
batch_size; i++) + { + for(int j = 0; j < (3*hidden_size); j++) + { + h2h_mat[i *(3*hidden_size) + j] += candidate_bias[j]; + } + } + slice_axis_1(i2h_mat, i2h_r, batch_size, 3 * hidden_size, 0, hidden_size); + slice_axis_1(i2h_mat, i2h_z, batch_size, 3 * hidden_size, hidden_size, 2*hidden_size); + slice_axis_1(i2h_mat, i2h, batch_size, 3 * hidden_size, 2*hidden_size, 3*hidden_size); + + slice_axis_1(h2h_mat, h2h_r, batch_size, 3 * hidden_size, 0, hidden_size); + slice_axis_1(h2h_mat, h2h_z, batch_size, 3 * hidden_size, hidden_size, 2*hidden_size); + slice_axis_1(h2h_mat, h2h, batch_size, 3 * hidden_size, 2*hidden_size, 3*hidden_size); + + for(int i = 0; i < batch_size*hidden_size; i++) + { + r_g[i] = i2h_r[i]+h2h_r[i]; + } + sigmoid(r_g,hidden_size * batch_size); + for(int i = 0; i < batch_size*hidden_size; i++) + { + u_g[i] = i2h_z[i]+h2h_z[i]; + } + sigmoid(u_g,hidden_size * batch_size); + + for(int i = 0; i < batch_size*hidden_size; i++) + { + next_h_tmp[i] = tanh(i2h[i]+r_g[i]*h2h[i]); + } + + for(int i = 0; i < batch_size*hidden_size; i++) + { + init_h[i] = u_g[i] * init_h[i] + (1-u_g[i]) * next_h_tmp[i]; + } + + // free memory + free(i2h_mat); + free(h2h_mat); + free(i2h_r); + free(i2h_z); + free(i2h); + free(h2h_r); + free(h2h_z); + free(h2h); + free(r_g); + free(u_g); + free(next_h_tmp); + + return true; + } + else + { + int input_total_size = input_size + hidden_size; + int batch_cell_size = hidden_size * batch_size; + + float* merged_input = ( float* )malloc(sizeof(float) * batch_size * (input_total_size)); + float* matmul_result = ( float* )malloc(sizeof(float) * batch_size *2* hidden_size ); + float* r = ( float* )malloc(batch_cell_size * sizeof(float)); + float* u = ( float* )malloc(batch_cell_size * sizeof(float)); + float* c = ( float* )malloc(batch_cell_size * sizeof(float)); + float* r_state = ( float* )malloc(batch_cell_size * sizeof(float)); + float* candidate = ( float* )malloc(sizeof(float) * batch_size* hidden_size); + + // merge input + 
concat_axis_1(input, init_h, merged_input, batch_size, input_size, hidden_size); + // do gemm + do_gemm(merged_input, kernel, matmul_result, batch_size, input_total_size, 2*hidden_size, input_total_size, + 2*hidden_size, 2*hidden_size); + // add bias + + + for(int i = 0; i < batch_size; i++) + { + for(int j = 0; j < (2*hidden_size); j++) + { + matmul_result[i *(2*hidden_size) + j] += bias[j]; + } + + } + + + sigmoid(matmul_result,2*hidden_size * batch_size); + slice_axis_1(matmul_result, r, batch_size, 2 * hidden_size, 0, hidden_size); + slice_axis_1(matmul_result, u, batch_size, 2 * hidden_size, hidden_size, 2*hidden_size); + + + for(int i = 0; i < batch_cell_size; i++) + r_state[i] = r[i] * init_h[i]; + + concat_axis_1(input, r_state, merged_input, batch_size, input_size, hidden_size); + //candidate kernerl + + + do_gemm(merged_input, candidate_kernel, candidate, batch_size, input_total_size, hidden_size, input_total_size, + hidden_size, hidden_size); + //candidate bias + + for(int i = 0; i < batch_size; i++) + { + for(int j = 0; j < hidden_size; j++) + { + candidate[i *hidden_size + j] += candidate_bias[j]; + } + } + + + for(int i = 0; i < batch_cell_size; i++) + { + c[i] = tanh(candidate[i]); + } + + for(int i = 0; i < batch_cell_size; i++) + { + init_h[i] = u[i] * init_h[i] + (1-u[i]) * c[i]; + } + // free memory + free(merged_input); + free(matmul_result); + free(candidate); + free(r); + free(u); + free(c); + return true; + } + + + } + + bool do_GRU(const float* input, float* output, float* init_h, const float* kernel, + const float* bias,const float* candidate_kernel,const float* candidate_bias, + int seq_lens, int batch_size, int input_size,int output_len, int hidden_size,int mxnet_flag) + { + for(int i = 0; i < seq_lens; i++) + { + + const float* seq_input = input + i * batch_size * input_size; + if(!do_GRU_step(seq_input, init_h, kernel, bias, candidate_kernel,candidate_bias,batch_size, input_size, hidden_size,mxnet_flag)) + { + return false; + } + + if(i 
+ output_len >= seq_lens) + { + memcpy(output, init_h, batch_size*hidden_size * sizeof(float)); + output += batch_size*hidden_size; + } + } + + return true; + } + + bool Prerun(Node* node) + { + GRU* gru_op = dynamic_cast(node->GetOp()); + + int in_num = node->GetInputNum(); + + for(int count = 0; count < in_num; count++) + { + Tensor* temptensor = node->GetInputTensor(count); + const std::string& name = temptensor->GetName(); + + if(name.find(gru_op->GetInitHiddenName()) != std::string::npos) + { + init_h_tensor = temptensor; + } + if(name.find(gru_op->GetBiasName()) != std::string::npos) + { + bias_tensor = temptensor; + } + if(name.find(gru_op->GetKernelName()) != std::string::npos) + { + kernel_tensor = temptensor; + } + if(name.find(gru_op->GetCandidateKernelName()) != std::string::npos) + { + candidate_kernel_tensor = temptensor; + } + if(name.find(gru_op->GetCandidateBiasName()) != std::string::npos) + { + candidate_bias_tensor = temptensor; + } + if(name.find(gru_op->Geti2hweightName()) != std::string::npos) + { + kernel_tensor = temptensor; + } + if(name.find(gru_op->Geti2hbiasName()) != std::string::npos) + { + bias_tensor = temptensor; + } + if(name.find(gru_op->Geth2hweightName()) != std::string::npos) + { + candidate_kernel_tensor = temptensor; + } + if(name.find(gru_op->Geth2hbiasName()) != std::string::npos) + { + candidate_bias_tensor = temptensor; + } + if(name.find(gru_op->GetFusedKernelName()) != std::string::npos) + { + fused_kernel_tensor = temptensor; + } + + + } + + if(init_h_tensor) + { + init_h_data = get_tensor_mem(init_h_tensor); + } + + return true; + } + + bool Run(Node* node) + { + GRU* gru_op = dynamic_cast(node->GetOp()); + GRUParam* param = gru_op->GetParam(); + + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + // Tensor* kernel_tensor = node->GetInputTensor(1); + + + + int input_size = 0; + int hidden_size = param->hidden_size; + + float* output = ( float* 
)get_tensor_mem(output_tensor); + // std::cout<<"ot::"<GetShape(); + + int seq_lens = input_shape.Shape(0); + int batch_size = input_shape.Shape(1); + int output_len = param->output_len; + int mxnet_flag = param->mxnet_flag; + + if(mxnet_flag==1) + { + input_size=input_shape.Shape(2); + // kernel_tensor = node->GetInputTensor(1); + } + else + { + input_size = param->input_size; + } + float* init_h = ( float* )malloc(batch_size * hidden_size * sizeof(float)); + + if(init_h == nullptr) + { + set_tengine_errno(ENOMEM); + return false; + } + + if(init_h_data) + { + for(int i = 0; i < batch_size; i++) + { + memcpy(init_h + i * hidden_size, init_h_data, hidden_size * sizeof(float)); + } + } + else + { + memset(init_h, 0x0, sizeof(batch_size * hidden_size * sizeof(float))); + } + + float* kernel = nullptr; + float* bias = nullptr; + float* fused_kernel=nullptr; + float* candidate_kernel = nullptr; + float* candidate_bias = nullptr; + + if(kernel_tensor) + kernel = ( float* )get_tensor_mem(kernel_tensor); + + if(bias_tensor) + bias = ( float* )get_tensor_mem(bias_tensor); + + if(candidate_kernel_tensor) + candidate_kernel = ( float* )get_tensor_mem(candidate_kernel_tensor); + + if(candidate_bias_tensor) + candidate_bias = ( float* )get_tensor_mem(candidate_bias_tensor); + + if(fused_kernel_tensor) + { + // std::cout<<"fused_kernel\n"; + fused_kernel=( float* )get_tensor_mem(fused_kernel_tensor); + kernel=fused_kernel; + candidate_kernel=fused_kernel+input_size*hidden_size*3; + bias=candidate_kernel+hidden_size*hidden_size*3; + candidate_bias=bias+hidden_size*3; + } + + bool ret = do_GRU(input, output, init_h, kernel, bias, candidate_kernel + ,candidate_bias,seq_lens, batch_size, input_size, output_len, hidden_size,mxnet_flag); + + free(init_h); + return ret; + } + + bool Postrun(Node* node) + { + return true; + } +}; + +} // namespace GRURefImpl + +using namespace GRURefImpl; +void RegisterGRUNodeExec(void) +{ + GRUOps* ops = new GRUOps(); + + 
NodeOpsRegistryManager::RegisterOPImplementor("common", "GRU", ops); +} + +} // namespace TEngine diff --git a/executor/operator/common/blas/lstm_blas.cpp b/executor/operator/common/blas/lstm_blas.cpp index 64e0d4c0b..04af0dad9 100644 --- a/executor/operator/common/blas/lstm_blas.cpp +++ b/executor/operator/common/blas/lstm_blas.cpp @@ -49,13 +49,18 @@ struct LSTMOps : public NodeOps Tensor* w_i_tensor; Tensor* w_o_tensor; Tensor* proj_tensor; + Tensor* kernel_tensor; + Tensor* h2h_kernel_tensor; + Tensor* h2h_bias_tensor; + Tensor* fused_kernel_tensor; void* init_h_data; void* init_c_data; - + // bool dynamic_shape; LSTMOps(void) { init_c_tensor = nullptr; init_h_tensor = nullptr; + kernel_tensor=nullptr; bias_tensor = nullptr; w_f_tensor = nullptr; w_i_tensor = nullptr; @@ -63,6 +68,9 @@ struct LSTMOps : public NodeOps proj_tensor = nullptr; init_h_data = nullptr; init_c_data = nullptr; + h2h_kernel_tensor=nullptr; + h2h_bias_tensor=nullptr; + fused_kernel_tensor=nullptr; } /* @@ -228,116 +236,191 @@ struct LSTMOps : public NodeOps { cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, a, lda, b, ldb, 0.0, c, ldc); } + void do_gemm_mx(const float* a, const float* b, float* c, int m, int k, int n, int lda, int ldb, int ldc) + { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0, a, lda, b, ldb, 0.0, c, ldc); + } bool do_LSTM_step(const float* input, float* init_h, float* init_c, const float* kernel, const float* bias, + const float* h2h_kernel, const float* h2h_bias, const float* w_f_data, const float* w_i_data, const float* w_o_data, const float* projection, - float forget_bias, int batch_size, int input_size, int hidden_size, int cell_size) + float forget_bias, int batch_size, int input_size, int hidden_size, int cell_size,int mxnet_flag) { - int input_total_size = input_size + hidden_size; - int batch_cell_size = cell_size * batch_size; + if(mxnet_flag==1) + { + int batch_cell_size = cell_size * batch_size; + float* i2h = ( 
float* )malloc(sizeof(float) * batch_size * cell_size * 4); + float* h2h = ( float* )malloc(sizeof(float) * batch_size * cell_size * 4); + float* gates = ( float* )malloc(sizeof(float) * batch_size * cell_size * 4); + + float* ig = ( float* )malloc(batch_cell_size * sizeof(float)); + float* cg = ( float* )malloc(batch_cell_size * sizeof(float)); + float* fg = ( float* )malloc(batch_cell_size * sizeof(float)); + float* og = ( float* )malloc(batch_cell_size * sizeof(float)); + // m k n + do_gemm_mx(input, kernel, i2h, batch_size, input_size, 4 * cell_size, input_size, + input_size, 4 * cell_size); + + if(bias) + { + for(int i = 0; i < batch_size; i++) + for(int j = 0; j < 4 * cell_size; j++) + i2h[i * 4 * cell_size + j] += bias[j]; + } - float* merged_input = ( float* )malloc(sizeof(float) * batch_size * (input_total_size)); - float* matmul_result = ( float* )malloc(sizeof(float) * batch_size * cell_size * 4); + do_gemm_mx(init_h, h2h_kernel, h2h, batch_size, hidden_size, 4 * hidden_size, hidden_size, + hidden_size, 4 * hidden_size); + if(h2h_bias) + { + for(int i = 0; i < batch_size; i++) + for(int j = 0; j < 4 * cell_size; j++) + h2h[i * 4 * cell_size + j] += h2h_bias[j]; + } + + for(int i = 0; i < batch_size*4*cell_size; i++) + gates[i] = i2h[i]+h2h[i]; + + slice_axis_1(gates, ig, batch_size, 4 * cell_size, 0, cell_size); + slice_axis_1(gates, fg, batch_size, 4 * cell_size, cell_size, 2 * cell_size); + slice_axis_1(gates, cg, batch_size, 4 * cell_size, 2 * cell_size, 3 * cell_size); + slice_axis_1(gates, og, batch_size, 4 * cell_size, 3 * cell_size, 4 * cell_size); + + for(int i = 0; i < batch_size*cell_size; i++) + fg[i]+=1; + + sigmoid(ig, batch_cell_size); + sigmoid(fg, batch_cell_size); + mytanh(cg, batch_cell_size); + sigmoid(og, batch_cell_size); - // merge input - concat_axis_1(input, init_h, merged_input, batch_size, input_size, hidden_size); + for(int i = 0; i < batch_cell_size; i++) + init_c[i] = init_c[i] * fg[i] + cg[i] * ig[i]; + + for(int i = 0; i < 
batch_cell_size; i++) + { + init_h[i] = tanh(init_c[i]) * og[i]; + } - // do gemm - do_gemm(merged_input, kernel, matmul_result, batch_size, input_total_size, 4 * cell_size, input_total_size, - 4 * cell_size, 4 * cell_size); + free(i2h); + free(h2h); + free(gates); + free(ig); + free(fg); + free(cg); + free(og); + return true; - // add bias - if(bias) - { - for(int i = 0; i < batch_size; i++) - for(int j = 0; j < 4 * cell_size; j++) - matmul_result[i * 4 * cell_size + j] += bias[j]; } + else + { + int input_total_size = input_size + hidden_size; + int batch_cell_size = cell_size * batch_size; - float* ig = ( float* )malloc(batch_cell_size * sizeof(float)); - float* cg = ( float* )malloc(batch_cell_size * sizeof(float)); - float* fg = ( float* )malloc(batch_cell_size * sizeof(float)); - float* og = ( float* )malloc(batch_cell_size * sizeof(float)); + float* merged_input = ( float* )malloc(sizeof(float) * batch_size * (input_total_size)); + float* matmul_result = ( float* )malloc(sizeof(float) * batch_size * cell_size * 4); - slice_axis_1(matmul_result, ig, batch_size, 4 * cell_size, 0, cell_size); - slice_axis_1(matmul_result, cg, batch_size, 4 * cell_size, cell_size, 2 * cell_size); - slice_axis_1(matmul_result, fg, batch_size, 4 * cell_size, 2 * cell_size, 3 * cell_size); - slice_axis_1(matmul_result, og, batch_size, 4 * cell_size, 3 * cell_size, 4 * cell_size); + // merge input + concat_axis_1(input, init_h, merged_input, batch_size, input_size, hidden_size); - // forget gate - for(int i = 0; i < batch_cell_size; i++) - fg[i] += forget_bias; + // do gemm + do_gemm(merged_input, kernel, matmul_result, batch_size, input_total_size, 4 * cell_size, input_total_size, + 4 * cell_size, 4 * cell_size); - // peephole - if(w_f_data) - { - for(int i = 0; i < batch_size; i++) - for(int j = 0; j < cell_size; j++) - { - fg[i * cell_size + j] += init_c[i * cell_size + j] * w_f_data[j]; - ig[i * cell_size + j] += init_c[i * cell_size + j] * w_i_data[j]; - } - } + // add bias + 
if(bias) + { + for(int i = 0; i < batch_size; i++) + for(int j = 0; j < 4 * cell_size; j++) + matmul_result[i * 4 * cell_size + j] += bias[j]; + } - sigmoid(fg, batch_cell_size); - sigmoid(ig, batch_cell_size); - mytanh(cg, batch_cell_size); + float* ig = ( float* )malloc(batch_cell_size * sizeof(float)); + float* cg = ( float* )malloc(batch_cell_size * sizeof(float)); + float* fg = ( float* )malloc(batch_cell_size * sizeof(float)); + float* og = ( float* )malloc(batch_cell_size * sizeof(float)); - // get cell output - for(int i = 0; i < batch_cell_size; i++) - init_c[i] = init_c[i] * fg[i] + cg[i] * ig[i]; + slice_axis_1(matmul_result, ig, batch_size, 4 * cell_size, 0, cell_size); + slice_axis_1(matmul_result, cg, batch_size, 4 * cell_size, cell_size, 2 * cell_size); + slice_axis_1(matmul_result, fg, batch_size, 4 * cell_size, 2 * cell_size, 3 * cell_size); + slice_axis_1(matmul_result, og, batch_size, 4 * cell_size, 3 * cell_size, 4 * cell_size); - if(w_o_data) - { - for(int i = 0; i < batch_size; i++) - for(int j = 0; j < cell_size; j++) - { - og[i * cell_size + j] += init_c[i * cell_size + j] * w_o_data[j]; - } - } + // forget gate + for(int i = 0; i < batch_cell_size; i++) + fg[i] += forget_bias; - sigmoid(og, batch_cell_size); + // peephole + if(w_f_data) + { + for(int i = 0; i < batch_size; i++) + for(int j = 0; j < cell_size; j++) + { + fg[i * cell_size + j] += init_c[i * cell_size + j] * w_f_data[j]; + ig[i * cell_size + j] += init_c[i * cell_size + j] * w_i_data[j]; + } + } - if(projection) - { + sigmoid(fg, batch_cell_size); + sigmoid(ig, batch_cell_size); + mytanh(cg, batch_cell_size); + + // get cell output for(int i = 0; i < batch_cell_size; i++) + init_c[i] = init_c[i] * fg[i] + cg[i] * ig[i]; + + if(w_o_data) { - og[i] = tanh(init_c[i]) * og[i]; + for(int i = 0; i < batch_size; i++) + for(int j = 0; j < cell_size; j++) + { + og[i * cell_size + j] += init_c[i * cell_size + j] * w_o_data[j]; + } } - /*batchxcell_size * cell_sizexhidden_size --> batch* 
hidden_size*/ - do_gemm(og, projection, init_h, batch_size, cell_size, hidden_size, cell_size, hidden_size, hidden_size); - } - else - { - for(int i = 0; i < batch_cell_size; i++) + sigmoid(og, batch_cell_size); + + if(projection) { - init_h[i] = tanh(init_c[i]) * og[i]; + for(int i = 0; i < batch_cell_size; i++) + { + og[i] = tanh(init_c[i]) * og[i]; + } + + /*batchxcell_size * cell_sizexhidden_size --> batch* hidden_size*/ + do_gemm(og, projection, init_h, batch_size, cell_size, hidden_size, cell_size, hidden_size, hidden_size); + } + else + { + for(int i = 0; i < batch_cell_size; i++) + { + init_h[i] = tanh(init_c[i]) * og[i]; + } } - } - // free memory - free(merged_input); - free(matmul_result); - free(ig); - free(cg); - free(fg); - free(og); + // free memory + free(merged_input); + free(matmul_result); + free(ig); + free(cg); + free(fg); + free(og); + return true; + } + - return true; + } bool do_LSTM(const float* input, float* output, float* init_h, float* init_c, const float* kernel, - const float* bias, const float* w_f_data, const float* w_i_data, const float* w_o_data, + const float* bias, const float* h2h_kernel, const float* h2h_bias, const float* w_f_data, const float* w_i_data, const float* w_o_data, const float* projection, float forget_bias, int seq_lens, int batch_size, int input_size, - int output_len, int hidden_size, int cell_size) + int output_len, int hidden_size, int cell_size,int mxnet_flag) { for(int i = 0; i < seq_lens; i++) { const float* seq_input = input + i * batch_size * input_size; - if(!do_LSTM_step(seq_input, init_h, init_c, kernel, bias, w_f_data, w_i_data, w_o_data, projection, - forget_bias, batch_size, input_size, hidden_size, cell_size)) + if(!do_LSTM_step(seq_input, init_h, init_c, kernel, bias,h2h_kernel,h2h_bias, w_f_data, w_i_data, w_o_data, projection, + forget_bias, batch_size, input_size, hidden_size, cell_size,mxnet_flag)) return false; if(i + output_len >= seq_lens) @@ -360,7 +443,10 @@ struct LSTMOps : public 
NodeOps { Tensor* temptensor = node->GetInputTensor(count); const std::string& name = temptensor->GetName(); - + if(name.find(lstm_op->GetKernelName()) != std::string::npos) + { + kernel_tensor = temptensor; + } if(name.find(lstm_op->GetInitCellName()) != std::string::npos) { init_c_tensor = temptensor; @@ -389,6 +475,26 @@ struct LSTMOps : public NodeOps { proj_tensor = temptensor; } + if(name.find(lstm_op->Geti2hKernelName()) != std::string::npos) + { + kernel_tensor = temptensor; + } + if(name.find(lstm_op->Geti2hBiasName()) != std::string::npos) + { + bias_tensor = temptensor; + } + if(name.find(lstm_op->Geth2hKernelName()) != std::string::npos) + { + h2h_kernel_tensor = temptensor; + } + if(name.find(lstm_op->Geth2hBiasName()) != std::string::npos) + { + h2h_bias_tensor = temptensor; + } + if(name.find(lstm_op->GetFusedKernelName()) != std::string::npos) + { + fused_kernel_tensor = temptensor; + } } if(init_c_tensor) @@ -411,17 +517,17 @@ struct LSTMOps : public NodeOps Tensor* input_tensor = node->GetInputTensor(0); Tensor* output_tensor = node->GetOutputTensor(0); - Tensor* kernel_tensor = node->GetInputTensor(1); + // Tensor* kernel_tensor = node->GetInputTensor(1); float forget_bias = param->forget_bias; bool has_peephole = param->has_peephole; bool has_projection = param->has_projection; - int input_size = param->input_size; + int hidden_size = param->hidden_size; int cell_size = param->cell_size; - + int input_size=0; float* output = ( float* )get_tensor_mem(output_tensor); float* input = ( float* )get_tensor_mem(input_tensor); @@ -430,7 +536,16 @@ struct LSTMOps : public NodeOps int seq_lens = input_shape.Shape(0); int batch_size = input_shape.Shape(1); int output_len = param->output_len; + int mxnet_flag= param->mxnet_flag; + if(mxnet_flag==1) + { + input_size=input_shape.Shape(2); + } + else + { + input_size = param->input_size; + } float* init_h = ( float* )malloc(batch_size * hidden_size * sizeof(float)); if(init_h == nullptr) @@ -462,16 +577,27 @@ 
struct LSTMOps : public NodeOps memset(init_c, 0x0, sizeof(batch_size * cell_size * sizeof(float))); } - float* kernel = ( float* )get_tensor_mem(kernel_tensor); - + float* kernel =nullptr; float* bias = nullptr; float* w_f_data = nullptr; float* w_i_data = nullptr; float* w_o_data = nullptr; float* projection = nullptr; + float* h2h_kernel =nullptr; + float* h2h_bias =nullptr; + float* fused_kernel =nullptr; + if(kernel_tensor) + kernel = ( float* )get_tensor_mem(kernel_tensor); + if(bias_tensor) bias = ( float* )get_tensor_mem(bias_tensor); + + if(h2h_kernel_tensor) + h2h_kernel = ( float* )get_tensor_mem(h2h_kernel_tensor); + + if(h2h_bias_tensor) + h2h_bias = ( float* )get_tensor_mem(h2h_bias_tensor); if(has_peephole) { @@ -479,12 +605,23 @@ struct LSTMOps : public NodeOps w_i_data = ( float* )get_tensor_mem(w_i_tensor); w_o_data = ( float* )get_tensor_mem(w_o_tensor); } + //int bsize=2*cell_size*4; + if(fused_kernel_tensor) + { + fused_kernel=( float* )get_tensor_mem(fused_kernel_tensor); + int kernel_size=get_tensor_mem_size(fused_kernel_tensor)/sizeof(float); + kernel=fused_kernel; + h2h_kernel=kernel+input_size*hidden_size*4; + bias=kernel+kernel_size-hidden_size*4*2; + h2h_bias=bias+hidden_size*4; + } if(has_projection) projection = ( float* )get_tensor_mem(proj_tensor); - bool ret = do_LSTM(input, output, init_h, init_c, kernel, bias, w_f_data, w_i_data, w_o_data, projection, - forget_bias, seq_lens, batch_size, input_size, output_len, hidden_size, cell_size); + // std::cout<<"inputmem: "< +#include +#include +#include +#include + +#include "graph.hpp" +#include "logger.hpp" +#include "node_ops.hpp" +#include "operator/rnn.hpp" +#include "tensor_mem.hpp" +#include "tengine_errno.hpp" +#include +#include + +namespace TEngine { + +namespace RNNRefImpl { + +struct RNNOps : public NodeOps +{ + Tensor* init_h_tensor; + Tensor* bias_tensor; + void* init_h_data; + + RNNOps(void) + { + init_h_tensor = nullptr; + bias_tensor = nullptr; + init_h_data = nullptr; + } 
+ + /* + @ func_name: concat_axis_1 + @ param: + a:[m, n1] + b:[m, n2] + c:[m, n1 + n2] + */ + void concat_axis_1(const float* a, const float* b, float* c, int m, int n1, int n2) + { + int n = n1 + n2; + for(int i = 0; i < m; i++) + { + for(int j = 0; j < n1; j++) + { + c[j + i * n] = a[j + i * n1]; + } + for(int j = 0; j < n2; j++) + { + c[j + i * n + n1] = b[j + i * n2]; + } + } + } + + void do_gemm(const float* a, const float* b, float* c, int m, int k, int n, int lda, int ldb, int ldc) + { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, a, lda, b, ldb, 0.0, c, ldc); + } + + bool do_RNN_step(const float* input, float* init_h, const float* kernel, const float* bias, + int batch_size, int input_size, int hidden_size) + { + int input_total_size = input_size + hidden_size; + int batch_cell_size = hidden_size * batch_size; + + float* ig = ( float* )malloc(batch_cell_size * sizeof(float)); + + float* merged_input = ( float* )malloc(sizeof(float) * batch_size * (input_total_size)); + float* matmul_result = ( float* )malloc(sizeof(float) * batch_size * hidden_size ); + + // merge input + concat_axis_1(input, init_h, merged_input, batch_size, input_size, hidden_size); + + // do gemm + do_gemm(merged_input, kernel, matmul_result, batch_size, input_total_size, hidden_size, input_total_size, + hidden_size, hidden_size); + + // add bias + if(bias) + { + for(int i = 0; i < batch_size; i++) + for(int j = 0; j < hidden_size; j++) + matmul_result[i *hidden_size + j] += bias[j]; + } + //activation + for(int i = 0; i < batch_cell_size; i++) + { + ig[i] = tanh(matmul_result[i]); + init_h[i]=ig[i]; + } + + // free memory + free(merged_input); + free(matmul_result); + free(ig); + + return true; + } + + bool do_RNN(const float* input, float* output, float* init_h, const float* kernel, + const float* bias, int seq_lens, int batch_size, int input_size,int output_len, int hidden_size) + { + for(int i = 0; i < seq_lens; i++) + { + const float* seq_input = input + i 
* batch_size * input_size; + + if(!do_RNN_step(seq_input, init_h, kernel, bias, batch_size, input_size, hidden_size)) + return false; + //outputs [batch_size,seq_len,hidden_size] + //final_state [batch_size,hidden_size] + if(i + output_len >= seq_lens) + { + memcpy(output, init_h, batch_size*hidden_size * sizeof(float)); + output += batch_size*hidden_size; + } + } + + return true; + } + + bool Prerun(Node* node) + { + RNN* rnn_op = dynamic_cast(node->GetOp()); + + int in_num = node->GetInputNum(); + + for(int count = 0; count < in_num; count++) + { + Tensor* temptensor = node->GetInputTensor(count); + const std::string& name = temptensor->GetName(); + + if(name.find(rnn_op->GetInitHiddenName()) != std::string::npos) + { + init_h_tensor = temptensor; + } + if(name.find(rnn_op->GetBiasName()) != std::string::npos) + { + bias_tensor = temptensor; + } + + } + + if(init_h_tensor) + { + init_h_data = get_tensor_mem(init_h_tensor); + } + + return true; + } + + bool Run(Node* node) + { + RNN* rnn_op = dynamic_cast(node->GetOp()); + RNNParam* param = rnn_op->GetParam(); + + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + Tensor* kernel_tensor = node->GetInputTensor(1); + + int input_size = param->input_size; + int hidden_size = param->hidden_size; + + float* output = ( float* )get_tensor_mem(output_tensor); + float* input = ( float* )get_tensor_mem(input_tensor); + + const TShape& input_shape = input_tensor->GetShape(); + + int seq_lens = input_shape.Shape(0); + int batch_size = input_shape.Shape(1); + int output_len = param->output_len; + + float* init_h = ( float* )malloc(batch_size * hidden_size * sizeof(float)); + + if(init_h == nullptr) + { + set_tengine_errno(ENOMEM); + return false; + } + + if(init_h_data) + { + for(int i = 0; i < batch_size; i++) + { + memcpy(init_h + i * hidden_size, init_h_data, hidden_size * sizeof(float)); + } + } + else + { + memset(init_h, 0x0, sizeof(batch_size * hidden_size * 
sizeof(float))); + } + + float* kernel = ( float* )get_tensor_mem(kernel_tensor); + + float* bias = nullptr; + + if(bias_tensor) + bias = ( float* )get_tensor_mem(bias_tensor); + + bool ret = do_RNN(input, output, init_h, kernel, bias, seq_lens, batch_size, input_size, output_len, hidden_size); + + free(init_h); + + return ret; + } + + bool Postrun(Node* node) + { + return true; + } +}; + +} // namespace RNNRefImpl + +using namespace RNNRefImpl; +void RegisterRNNNodeExec(void) +{ + RNNOps* ops = new RNNOps(); + + NodeOpsRegistryManager::RegisterOPImplementor("common", "RNN", ops); +} + +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/common/concat.cpp b/executor/operator/common/concat.cpp index cb8ee5a2b..1e6304325 100644 --- a/executor/operator/common/concat.cpp +++ b/executor/operator/common/concat.cpp @@ -48,8 +48,13 @@ struct ConcatOps : public NodeOps int element_size = DataType::GetTypeSize(input_tensor->GetDataType()); Tensor* output_tensor = node->GetOutputTensor(0); auto out_quant = output_tensor->GetQuantParam(); - int out_zero = (*out_quant)[0].zero_point; - float out_scale = (*out_quant)[0].scale; + int out_zero = 0; + float out_scale = 1; + if( !out_quant->empty() ) + { + out_zero = (*out_quant)[0].zero_point; + out_scale = (*out_quant)[0].scale; + } Concat* concat_op = dynamic_cast(node->GetOp()); ConcatParam* param = concat_op->GetParam(); @@ -107,15 +112,25 @@ struct ConcatOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + if(data_type != TENGINE_DT_FP32 && data_type != TENGINE_DT_UINT8) + return nullptr; + + ConcatOps* ops = new ConcatOps(); + + return ops; +} + } // namespace ConcatImpl using namespace ConcatImpl; void RegisterConcatNodeExec(void) { - ConcatOps* ops = new ConcatOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Concat", ops); + 
NodeOpsRegistryManager::RegisterOPImplementor("common", "Concat", ConcatImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/conv_ref.cpp b/executor/operator/common/conv_ref.cpp deleted file mode 100644 index 2d5fdfe4b..000000000 --- a/executor/operator/common/conv_ref.cpp +++ /dev/null @@ -1,498 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haoluo@openailab.com - */ -#include -#include -#include -#include -#include - -#include "logger.hpp" -#include "node_ops.hpp" -#include "tensor_mem.hpp" -#include "data_type.hpp" - -#include "graph.hpp" -#include "operator/convolution.hpp" - -namespace TEngine { - -namespace conv_ref { - -struct op_data -{ - float i_scale; - int i_zero; - float k_scale; - int k_zero; - float o_scale; - int o_zero; - int activation_min; - int activation_max; -}; - -const char* conv_name = "CONV_REF"; -const int default_prio = 1500; -/* -template -void interleave_kernel(void* kernel_org , void* kernel_interleaved,int output_chan , - int kernel_h, int kernel_w,int kernel_c) -{ - data_type* kernel = (data_type*) kernel_org; - data_type* kernel_inter = (data_type*) kernel_interleaved; - - int kernel_size = kernel_h * kernel_w * kernel_c; - for(int i =0;i(std::round(f / scale)); }; - - if(activation_type == 0) - { - op_param.activation_max = 255; - op_param.activation_min = std::max(0, quantize(0)); - } - else if(activation_type == 6) - { - op_param.activation_max = std::min(255, quantize(6)); - op_param.activation_min = std::max(0, quantize(0)); - } - else if(activation_type == 1) - { - op_param.activation_max = std::min(255, quantize(1)); - op_param.activation_min = std::max(0, quantize(-1)); - } - else - { - op_param.activation_max = 255; - op_param.activation_min = 0; - } - - return true; -} -/* -bool GetQuantizedMultiplerShift(op_data& op_param) -{ - const double input_product_scale = op_param.i_scale*op_param.k_scale; - double double_multiplier = input_product_scale/op_param.o_scale; - int shift = 0; - if(double_multiplier<1) - { - while(double_multiplier < 0.5) - { - double_multiplier*=2; - shift ++; - } - } - else if(double_multiplier>=1) - { - while(double_multiplier>1) - { - double_multiplier/=2; - shift --; - } - } - op_param.multiplier = std::round(double_multiplier * 256); - op_param.shift = -shift; - //printf("%f, 
%f, %f, %f,%d, %d\n",op_param.i_scale,op_param.k_scale,op_param.o_scale, - // dd, op_param.multiplier, shift); - //printf("%d, %d\n",op_param.i_zero,op_param.k_zero); - - return true; -} -*/ -template -void im2col(void* input_org, void* im2col, int input_chan, int input_x, int input_y, int kernel_x, int kernel_y, - int stride_x, int stride_y, int pad_x0, int pad_y0, int pad_x1, int pad_y1, int output_x, int output_y, - int group, int i_zero) -{ - data_type* input = ( data_type* )input_org; - data_type* col = ( data_type* )im2col; - - int input_c = input_chan * group; - int kernel_size = input_chan * kernel_x * kernel_y; - for(int h = 0; h < output_y; h++) - { - data_type* col_h = col + output_x * kernel_size * h; - for(int w = 0; w < output_x; w++) - { - data_type* col_w = col_h + kernel_size * w; - int w_start = w * stride_x - pad_x0; - int w_end = w_start + kernel_x; - int h_start = h * stride_y - pad_y0; - int h_end = h_start + kernel_y; - - for(int kh = h_start; kh < h_end; kh++) - for(int kw = w_start; kw < w_end; kw++) - for(int kc = 0; kc < input_chan; kc++) - { - if(kh < 0 || kh >= input_y || kw < 0 || kw >= input_x) - { - *col_w++ = ( data_type )i_zero; - } - else - *col_w++ = input[kh * input_c * input_x + kw * input_c + kc]; - } - } - } -} - -template -static void run_kernel(void* input, void* output, void* kernel, void* bias, int activation, int kernel_h, int kernel_w, - int input_c, int output_chan, int output_x, int output_y, int group, op_data param) -{ - data_type* output0 = ( data_type* )output; - data_type* kernel0 = ( data_type* )kernel; - - int in_chan_rel = input_c * group; - int out_chan_real = output_chan * group; - int kernel_size = input_c * kernel_h * kernel_w; - - for(int c = 0; c < output_chan; c++) - { - data_type* kernel_cur = kernel0 + c * in_chan_rel * kernel_h * kernel_w; - if(sizeof(data_type) == 4) - { - float* bias0 = ( float* )bias; - float bias_cur = bias0 ? 
bias0[c] : 0; - for(int h = 0; h < output_y; h++) - for(int w = 0; w < output_x; w++) - { - int index = h * output_x * out_chan_real + w * out_chan_real + c; - float tmp = bias_cur; - float* input_cur = ( float* )input + kernel_size * h * output_x + w * kernel_size; - for(int i = 0; i < kernel_h; i++) - for(int j = 0; j < kernel_w; j++) - for(int k = 0; k < input_c; k++) - { - int pos = i * kernel_w * in_chan_rel + j * in_chan_rel + k; - tmp += *input_cur * kernel_cur[pos]; - input_cur++; - } - - if(activation == 0) - { - if(tmp < 0) - tmp = 0; - } - if(activation == 6) - { - if(tmp < 0) - tmp = 0; - if(tmp > 6) - tmp = 6; - } - output0[index] = tmp; - } - } - else - { - int* bias0 = ( int* )bias; - int bias_cur = bias0 ? bias0[c] : 0; - for(int h = 0; h < output_y; h++) - for(int w = 0; w < output_x; w++) - { - int index = h * output_x * out_chan_real + w * out_chan_real + c; - int tmp = bias_cur; - uint8_t* input_cur = ( uint8_t* )input + kernel_size * h * output_x + w * kernel_size; - for(int i = 0; i < kernel_h; i++) - for(int j = 0; j < kernel_w; j++) - for(int k = 0; k < input_c; k++) - { - int pos = i * kernel_w * in_chan_rel + j * in_chan_rel + k; - tmp += (*input_cur - param.i_zero) * (kernel_cur[pos] - param.k_zero); - input_cur++; - } - tmp = std::round(tmp * param.i_scale * param.k_scale / param.o_scale); - - tmp += param.o_zero; - tmp = std::max(param.activation_min, tmp); - tmp = std::min(param.activation_max, tmp); - output0[index] = tmp; - } - } - } -} - -struct ConvRef : public MTNodeOps -{ - bool Prerun(Node* node) override; - bool Run(Node* node) override; - bool Reshape(Node* node) override; - bool Postrun(Node* node) override; - bool GetSharedMemorySize(Node*, unsigned int& mem_size) override; - bool SetSharedMemoryAddr(Node*, void* mem_addr, int mem_size) override; - - bool RunNHWC(Node* node); - bool RunNCHW(Node* node); - - op_data op_param; - int element_size; - bool dynamic_shape; -}; - -bool ConvRef::Reshape(Node* node) -{ - unsigned int 
new_col_size; - - GetSharedMemorySize(node, new_col_size); - - if(node->ExistAttr("col_buf_allocated")) - { - unsigned int col_size = any_cast(node->GetAttr("col_buf_allocated")); - - if(new_col_size == col_size) - return true; - - float* addr = any_cast(node->GetAttr("col_buf")); - mem_free(addr); - } - - float* col_buf = ( float* )mem_alloc(new_col_size); - node->SetAttr("col_buf", col_buf); - node->SetAttr("col_buf_allocated", new_col_size); - return true; -} - -bool ConvRef::SetSharedMemoryAddr(Node* node, void* mem_addr, int mem_size) -{ - (*node)["shared_col_buf"] = mem_addr; - return true; -} - -bool ConvRef::GetSharedMemorySize(Node* node, unsigned int& mem_size) -{ - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - int group = param->group; - - Tensor* output_tensor = node->GetOutputTensor(0); - TShape& output_shape = output_tensor->GetShape(); - int output_y = output_shape.GetH(); - int output_x = output_shape.GetW(); - - Tensor* input_tensor = node->GetInputTensor(0); - TShape& input_shape = input_tensor->GetShape(); - element_size = DataType::GetTypeSize(input_tensor->GetDataType()); - - int input_chan = input_shape.GetC(); - int kernel_size = input_chan / group * param->kernel_h * param->kernel_w; - int output_xy = output_x * output_y; - - mem_size = (element_size * kernel_size * output_xy); - - return true; -} - -bool ConvRef::Prerun(Node* node) -{ - if(!dynamic_shape) - { - if(node->ExistAttr("shared_col_buf")) - { - void* addr = any_cast(node->GetAttr("shared_col_buf")); - - (*node)["col_buf"] = addr; - } - else - { - unsigned int col_size; - - GetSharedMemorySize(node, col_size); - - void* col_buf = mem_alloc(col_size); - (*node)["col_buf"] = col_buf; - node->SetAttr("col_buf_allocated", col_size); - } - } - if(element_size == 1) - { - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - Tensor* input_tensor = node->GetInputTensor(0); - Tensor* kernel_tensor = 
node->GetInputTensor(1); - Tensor* output_tensor = node->GetOutputTensor(0); - - auto* in_quant = input_tensor->GetQuantParam(); - op_param.i_scale = (*in_quant)[0].scale; - op_param.i_zero = (*in_quant)[0].zero_point; - auto* k_quant = kernel_tensor->GetQuantParam(); - op_param.k_scale = (*k_quant)[0].scale; - op_param.k_zero = (*k_quant)[0].zero_point; - auto* o_quant = output_tensor->GetQuantParam(); - op_param.o_scale = (*o_quant)[0].scale; - op_param.o_zero = (*o_quant)[0].zero_point; - // GetQuantizedMultiplerShift(op_param); - GetQuantizedActivationMinMax(op_param, param->activation); - } - - return true; -} - -bool ConvRef::Run(Node* node) -{ - if(exec_attr->layout == TENGINE_LAYOUT_NHWC) - { - return RunNHWC(node); - } - else - { - // TODO: support NCHW - return false; - } -} - -bool ConvRef::RunNHWC(Node* node) -{ - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - - int kernel_h = param->kernel_h; - int kernel_w = param->kernel_w; - int stride_h = param->stride_h; - int stride_w = param->stride_w; - // int pad_h = param->pad_h; - // int pad_w = param->pad_w; - int dilation_h = param->dilation_h; - int dilation_w = param->dilation_w; - int pad_x0 = param->pads[1]; // left padding columns - int pad_x1 = param->pads[3]; // right padding columns - int pad_y0 = param->pads[0]; // top padding rows - int pad_y1 = param->pads[2]; // bottom padding rows - int group = param->group; - int activation = param->activation; - if(dilation_h != 1 || dilation_w != 1) - return false; - - Tensor* input_tensor = node->GetInputTensor(0); - uint8_t* input_org = ( uint8_t* )get_tensor_mem(input_tensor); - TShape& input_shape = input_tensor->GetShape(); - int input_w = input_shape.GetW(); - int input_h = input_shape.GetH(); - int input_c = input_shape.GetC() / group; - int input_n = input_shape.GetN(); - int input_size = input_w * input_h * input_c; - - Tensor* kernel_tensor = node->GetInputTensor(1); - uint8_t* kernel = ( uint8_t* 
)get_tensor_mem(kernel_tensor); - - uint8_t* bias_data = nullptr; - if(node->GetInputNum() > 2) - { - Tensor* bias_tensor = node->GetInputTensor(2); - bias_data = ( uint8_t* )get_tensor_mem(bias_tensor); - } - - Tensor* output_tensor = node->GetOutputTensor(0); - uint8_t* output_org = ( uint8_t* )get_tensor_mem(output_tensor); - - TShape& output_shape = output_tensor->GetShape(); - int output_w = output_shape.GetW(); - int output_h = output_shape.GetH(); - int output_c = output_shape.GetC() / group; - int output_xy = output_h * output_w; - - void* col_buf = any_cast(node->GetAttr("col_buf")); - uint8_t* col = ( uint8_t* )col_buf; - - for(int n = 0; n < input_n; n++) - { - uint8_t* input = input_org + n * input_size * group * element_size; - uint8_t* output = output_org + n * output_xy * output_c * group * element_size; - - for(int g = 0; g < group; g++) - { - uint8_t* input_g = input + input_c * g * element_size; - uint8_t* output_g = output + output_c * g * element_size; - uint8_t* kernel_g = kernel + input_c * g * element_size; - uint8_t* bias_g = bias_data ? 
bias_data + output_c * g * 4 : nullptr; - if(element_size == 4) - { - im2col(input_g, col, input_c, input_w, input_h, kernel_w, kernel_h, stride_w, stride_h, pad_x0, - pad_y0, pad_x1, pad_y1, output_w, output_h, group, 0); - run_kernel(col, output_g, kernel_g, bias_g, activation, kernel_h, kernel_w, input_c, output_c, - output_w, output_h, group, op_param); - } - - if(element_size == 1) - { - im2col(input_g, col, input_c, input_w, input_h, kernel_w, kernel_h, stride_w, stride_h, pad_x0, - pad_y0, pad_x1, pad_y1, output_w, output_h, group, op_param.i_zero); - run_kernel(col, output_g, kernel_g, bias_g, activation, kernel_h, kernel_w, input_c, output_c, - output_w, output_h, group, op_param); - } - } - } - - return true; -} - -bool ConvRef::Postrun(Node* node) -{ - if(node->ExistAttr("col_buf_allocated")) - { - void* addr = any_cast(node->GetAttr("col_buf")); - mem_free(addr); - node->RemoveAttr("col_buf_allocated"); - } - return true; -} - -NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) -{ - ConvRef* ops = new ConvRef(); - - ops->need_free = true; - if(node->IsDynamicShape()) - ops->dynamic_shape = true; - else - ops->dynamic_shape = false; - - return ops; -} - -} // namespace conv_ref - -void RegisterConv2dRef(void) -{ - NodeOpsRegistryManager::RegisterOPImplementor("common", "Convolution", conv_ref::SelectFunc, - conv_ref::default_prio); -} - -} // namespace TEngine diff --git a/executor/operator/common/detection_output.cpp b/executor/operator/common/detection_output.cpp index 52c591697..c47e7364c 100644 --- a/executor/operator/common/detection_output.cpp +++ b/executor/operator/common/detection_output.cpp @@ -240,15 +240,26 @@ struct DetectionOutputOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != 
TENGINE_LAYOUT_NCHW) + return nullptr; + + DetectionOutputOps* ops = new DetectionOutputOps(); + + return ops; +} + } // namespace DetectionOutputImpl using namespace DetectionOutputImpl; void RegisterDetectionOutputNodeExec(void) { - DetectionOutputOps* ops = new DetectionOutputOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "DetectionOutput", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "DetectionOutput", DetectionOutputImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/detection_postprocess.cpp b/executor/operator/common/detection_postprocess.cpp index 1f7ebb1ad..44ee81204 100644 --- a/executor/operator/common/detection_postprocess.cpp +++ b/executor/operator/common/detection_postprocess.cpp @@ -325,15 +325,27 @@ struct DetectionPostProcessOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if((data_type != TENGINE_DT_FP32&&data_type != TENGINE_DT_UINT8) || + exec_attr->graph_layout != TENGINE_LAYOUT_NHWC) + return nullptr; + + DetectionPostProcessOps* ops = new DetectionPostProcessOps(); + + return ops; +} + } // namespace DetectionPostProcessImpl using namespace DetectionPostProcessImpl; void RegisterDetectionPostProcessNodeExec(void) { - DetectionPostProcessOps* ops = new DetectionPostProcessOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "DetectionPostProcess", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "DetectionPostProcess", DetectionPostProcessImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/dropout.cpp b/executor/operator/common/dropout.cpp index e595ecd51..bfcdeb918 100644 --- a/executor/operator/common/dropout.cpp +++ b/executor/operator/common/dropout.cpp @@ -51,19 +51,39 @@ struct DropoutOps : public 
NodeOps bool Run(Node* node) { // Nothing needs to do for inference - return true; + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + void* input_org = get_tensor_mem(input_tensor); + void* output_org = get_tensor_mem(output_tensor); + if(input_org == output_org) + return true; + + int size = input_tensor->GetTotalSize(); + memcpy(output_org, input_org, size); + return true; } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + DropoutOps* ops = new DropoutOps(); + + return ops; +} + } // namespace DropImpl using namespace DropImpl; void RegisterDropoutNodeExec(void) { - DropoutOps* ops = new DropoutOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Dropout", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Dropout", DropImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/eltwise.cpp b/executor/operator/common/eltwise.cpp index bbfab37f7..dcde9f9ca 100644 --- a/executor/operator/common/eltwise.cpp +++ b/executor/operator/common/eltwise.cpp @@ -204,15 +204,26 @@ struct EltwiseOps : public NodeOps }; // struct EltwiseOps +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + EltwiseOps* ops = new EltwiseOps(); + + return ops; +} + } // namespace EltwiseImpl using namespace EltwiseImpl; void RegisterEltwiseNodeExec(void) { - EltwiseOps* ops = new EltwiseOps(); - - 
NodeOpsRegistryManager::RegisterOPImplementor("common", "Eltwise", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Eltwise", EltwiseImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/flatten.cpp b/executor/operator/common/flatten.cpp index 1a7fd0fbc..145e8ca9f 100644 --- a/executor/operator/common/flatten.cpp +++ b/executor/operator/common/flatten.cpp @@ -73,15 +73,26 @@ struct FlattenOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + FlattenOps* ops = new FlattenOps(); + + return ops; +} + } // namespace FlattenImpl using namespace FlattenImpl; void RegisterFlattenNodeExec(void) { - FlattenOps* ops = new FlattenOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Flatten", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Flatten", FlattenImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/fused/fused_bn_scale_relu.cpp b/executor/operator/common/fused/fused_bn_scale_relu.cpp index 1d2362bff..b360144de 100644 --- a/executor/operator/common/fused/fused_bn_scale_relu.cpp +++ b/executor/operator/common/fused/fused_bn_scale_relu.cpp @@ -187,15 +187,26 @@ struct FusedOps : public MTNodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + FusedOps* ops = new FusedOps(); + + return ops; +} + } // namespace FusedBNScaleReluImpl using namespace FusedBNScaleReluImpl; void 
RegisterCommonFusedBNScaleReluNodeExec(void) { - FusedOps* ops = new FusedOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", FusedBNScaleReLu::class_name, ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", FusedBNScaleReLu::class_name, FusedBNScaleReluImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/init.cpp b/executor/operator/common/init.cpp index 1b1f595ef..3cf766d3c 100644 --- a/executor/operator/common/init.cpp +++ b/executor/operator/common/init.cpp @@ -26,7 +26,6 @@ namespace TEngine { -extern void NodeOpsRegistryManagerInit(void); extern void RegisterConcatNodeExec(void); extern void RegisterDropoutNodeExec(void); extern void RegisterSoftmaxNodeExec(void); @@ -49,27 +48,23 @@ extern void RegisterReLuNodeExec(void); extern void RegisterResizeNodeExec(void); extern void RegisterLogisticNodeExec(void); extern void RegisterDetectionPostProcessNodeExec(void); -extern void RegisterConv2dRef(void); #ifdef CONFIG_ARCH_BLAS extern void RegisterConvBlasNodeExec(void); extern void RegisterDeconvBlasNodeExec(void); extern void RegisterFcBlasNodeExec(void); extern void RegisterLSTMNodeExec(void); +extern void RegisterRNNNodeExec(void); +extern void RegisterGRUNodeExec(void); #endif extern void RegisterPooling_NodeExec(void); extern void RegisterBatchNorm_NodeExec(void); extern void RegisterScale_NodeExec(void); extern void RegisterCommonFusedBNScaleReluNodeExec(void); -extern void RegisterDemoOps(void); void RegisterCommonOps(void) { -#ifndef ANDROID - RegisterDemoOps(); -#endif - RegisterConcatNodeExec(); RegisterDropoutNodeExec(); RegisterSoftmaxNodeExec(); @@ -92,13 +87,14 @@ void RegisterCommonOps(void) RegisterResizeNodeExec(); RegisterLogisticNodeExec(); RegisterDetectionPostProcessNodeExec(); - RegisterConv2dRef(); #ifdef CONFIG_ARCH_BLAS RegisterConvBlasNodeExec(); RegisterDeconvBlasNodeExec(); RegisterFcBlasNodeExec(); RegisterLSTMNodeExec(); + RegisterRNNNodeExec(); + RegisterGRUNodeExec(); 
#endif RegisterPooling_NodeExec(); RegisterBatchNorm_NodeExec(); diff --git a/executor/operator/common/logistic.cpp b/executor/operator/common/logistic.cpp index 3cd108d90..92ad3ea10 100644 --- a/executor/operator/common/logistic.cpp +++ b/executor/operator/common/logistic.cpp @@ -74,9 +74,9 @@ struct LogisticOps : public NodeOps auto o_quantized = output->GetQuantParam(); float i_scale = (*i_quantized)[0].scale; - float i_zero = (*i_quantized)[0].zero_point; + int i_zero = (*i_quantized)[0].zero_point; float o_scale = (*o_quantized)[0].scale; - float o_zero = (*o_quantized)[0].zero_point; + int o_zero = (*o_quantized)[0].zero_point; for(int i = 0; i < elements; i++) { @@ -90,15 +90,27 @@ struct LogisticOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if( (data_type != TENGINE_DT_FP32 && data_type != TENGINE_DT_UINT8) + || exec_attr->graph_layout != TENGINE_LAYOUT_NHWC) + return nullptr; + + LogisticOps* ops = new LogisticOps(); + + return ops; +} + } // namespace LogisticImpl using namespace LogisticImpl; void RegisterLogisticNodeExec(void) { - LogisticOps* ops = new LogisticOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Logistic", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Logistic", LogisticImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/lrn.cpp b/executor/operator/common/lrn.cpp index eb9a2f31b..151619a5f 100644 --- a/executor/operator/common/lrn.cpp +++ b/executor/operator/common/lrn.cpp @@ -122,15 +122,26 @@ struct LRNOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != 
TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + LRNOps* ops = new LRNOps(); + + return ops; +} + } // namespace LRNImpl using namespace LRNImpl; void RegisterLRN_NodeExec(void) { - LRNOps* ops = new LRNOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "LRN", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "LRN", LRNImpl::SelectFunc, 1000); } } // namespace TEngine \ No newline at end of file diff --git a/executor/operator/common/lrn_arm.cpp b/executor/operator/common/lrn_arm.cpp index 2d0fcf760..cea518834 100644 --- a/executor/operator/common/lrn_arm.cpp +++ b/executor/operator/common/lrn_arm.cpp @@ -269,6 +269,19 @@ struct LRNOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + LRNOps* ops = new LRNOps(); + + return ops; +} + } // namespace LRNImplArm using namespace LRNImplArm; @@ -276,13 +289,11 @@ using namespace LRNImplArm; void RegisterLRNNodeExec(void) { #ifdef CONFIG_ARCH_ARM32 - LRNOps* arm32_ops = new LRNOps(); - NodeOpsRegistryManager::RegisterOPImplementor("arm32", "LRN", arm32_ops); + NodeOpsRegistryManager::RegisterOPImplementor("arm32", "LRN", LRNImplArm::SelectFunc, 1000); #endif #ifdef CONFIG_ARCH_ARM64 - LRNOps* arm64_ops = new LRNOps(); - NodeOpsRegistryManager::RegisterOPImplementor("arm64", "LRN", arm64_ops); + NodeOpsRegistryManager::RegisterOPImplementor("arm64", "LRN", LRNImplArm::SelectFunc, 1000); #endif } diff --git a/executor/operator/common/normalize.cpp b/executor/operator/common/normalize.cpp index 2a70291dc..e387b3233 100644 --- a/executor/operator/common/normalize.cpp +++ b/executor/operator/common/normalize.cpp @@ -118,15 +118,26 @@ struct NormalizeOps : public 
NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + NormalizeOps* ops = new NormalizeOps(); + + return ops; +} + } // namespace NormalizeImpl using namespace NormalizeImpl; void RegisterNormalizeNodeExec(void) { - NormalizeOps* ops = new NormalizeOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Normalize", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Normalize", NormalizeImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/permute.cpp b/executor/operator/common/permute.cpp index b05623361..b683e7684 100644 --- a/executor/operator/common/permute.cpp +++ b/executor/operator/common/permute.cpp @@ -56,7 +56,7 @@ struct PermuteOps : public NodeOps } } - bool Run(Node* node) + bool Run(Node* node) { const Tensor* input_tensor = node->GetInputTensor(0); Tensor* output_tensor = node->GetOutputTensor(0); @@ -66,41 +66,83 @@ struct PermuteOps : public NodeOps const TShape& shape = input_tensor->GetShape(); const std::vector dims = shape.GetDim(); - - int batch_number = dims[0]; - int channel = dims[1]; - int width = dims[3]; - int height = dims[2]; - int _wc = width * channel; - int _hw = width * height; - int _chw = channel * _hw; - - float* input = ( float* )get_tensor_mem(input_tensor); - float* output = ( float* )get_tensor_mem(output_tensor); - // 0231 [bhwc] - if((param->order0 == 0) && (param->order1 == 2) && (param->order2 == 3) && (param->order3 == 1)) + if(dims.size()==4){ + int batch_number = dims[0]; + int channel = dims[1]; + int width = dims[3]; + int height = dims[2]; + int _wc = width * channel; + int _hw = width * height; + int _chw = channel * _hw; + + float* input = ( float* 
)get_tensor_mem(input_tensor); + float* output = ( float* )get_tensor_mem(output_tensor); + // 0231 [bhwc] + // other case to be support + if((param->order0 == 0) && (param->order1 == 2) && (param->order2 == 3) && (param->order3 == 1)) + { + for(int b = 0; b < batch_number; b++) + { + permute_hwc(input, output, height, width, channel, _wc, _hw); + input += _chw; + output += _chw; + } + } + } + else if(dims.size()==3) { - for(int b = 0; b < batch_number; b++) + int channel = dims[0]; + int width = dims[2]; + int height = dims[1]; + int _hw = height * width; + int _cw = channel * width; + + float* input = ( float* )get_tensor_mem(input_tensor); + float* output = ( float* )get_tensor_mem(output_tensor); + if((param->order0 == 1) && (param->order1 == 0) && (param->order2 == 2)) { - permute_hwc(input, output, height, width, channel, _wc, _hw); - input += _chw; - output += _chw; + for (int q=0; qGetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + PermuteOps* ops = new PermuteOps(); + + return ops; +} + } // namespace PermuteImpl using namespace PermuteImpl; void RegisterPermuteNodeExec(void) { - PermuteOps* ops = new PermuteOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Permute", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Permute", PermuteImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/pooling.cpp b/executor/operator/common/pooling.cpp index 9bd735440..cd8a69a51 100644 --- a/executor/operator/common/pooling.cpp +++ b/executor/operator/common/pooling.cpp @@ -245,7 +245,7 @@ struct PoolOps : public NodeOps uint8_t* input_data = ( uint8_t* )get_tensor_mem(itensor); uint8_t* output_data = ( uint8_t* )get_tensor_mem(otensor); - if(exec_attr->layout == TENGINE_LAYOUT_NCHW) + if(exec_attr->graph_layout == 
TENGINE_LAYOUT_NCHW) { if(param_->alg == kPoolMax) { @@ -260,9 +260,9 @@ struct PoolOps : public NodeOps for(int n = 0; n < input_n; n++) { Generic_MaxPool(( float* )input_data + n * in_chw, ( float* )output_data + n * out_chw, input_c, - input_h, input_w, output_h, output_w, param_->kernel_shape[0], - param_->kernel_shape[1], param_->strides[0], param_->strides[1], - param_->pads[0], param_->pads[1]); + input_h, input_w, output_h, output_w, param_->kernel_h, + param_->kernel_w, param_->stride_h, param_->stride_w, + param_->pad_h0, param_->pad_w0); } } } @@ -279,9 +279,9 @@ struct PoolOps : public NodeOps for(int n = 0; n < input_n; n++) { Generic_AvgPool(( float* )input_data + n * in_chw, ( float* )output_data + n * out_chw, input_c, - input_h, input_w, output_h, output_w, param_->kernel_shape[0], - param_->kernel_shape[1], param_->strides[0], param_->strides[1], - param_->pads[0], param_->pads[1], param_->caffe_flavor); + input_h, input_w, output_h, output_w, param_->kernel_h, + param_->kernel_w, param_->stride_h, param_->stride_w, + param_->pad_h0, param_->pad_w0, param_->caffe_flavor); } } } @@ -314,13 +314,13 @@ struct PoolOps : public NodeOps if(elem_size == 4) Generic_AvgPool_nhwc( input_data + n * in_chw * 4, output_data + n * out_chw * 4, input_c, input_h, input_w, - output_h, output_w, param_->kernel_shape[0], param_->kernel_shape[1], - param_->strides[0], param_->strides[1], param_->pads[0], param_->pads[1]); + output_h, output_w, param_->kernel_h, param_->kernel_w, + param_->stride_h, param_->stride_w, param_->pad_h0, param_->pad_w0); if(elem_size == 1) Generic_AvgPool_nhwc( input_data + n * in_chw, output_data + n * out_chw * 1, input_c, input_h, input_w, - output_h, output_w, param_->kernel_shape[0], param_->kernel_shape[1], - param_->strides[0], param_->strides[1], param_->pads[0], param_->pads[1]); + output_h, output_w, param_->kernel_h, param_->kernel_w, + param_->stride_h, param_->stride_w, param_->pad_h0, param_->pad_w0); } } } @@ -335,15 +335,26 
@@ struct PoolOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + PoolOps* ops = new PoolOps(); + + return ops; +} + } // namespace PoolingRef using namespace PoolingRef; void RegisterPooling_NodeExec(void) { - PoolOps* ops = new PoolOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Pooling", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Pooling", PoolingRef::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/prelu.cpp b/executor/operator/common/prelu.cpp index c7ea2e140..328f2633a 100644 --- a/executor/operator/common/prelu.cpp +++ b/executor/operator/common/prelu.cpp @@ -80,15 +80,26 @@ struct PreluOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + PreluOps* ops = new PreluOps(); + + return ops; +} + } // namespace PreluImpl using namespace PreluImpl; void RegisterPReLUNodeExec(void) { - PreluOps* ops = new PreluOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "PReLU", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "PReLU", PreluImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/priorbox.cpp b/executor/operator/common/priorbox.cpp index 3d0ec9676..52ace91e2 100644 --- a/executor/operator/common/priorbox.cpp +++ b/executor/operator/common/priorbox.cpp @@ -165,15 +165,26 @@ struct PriorBoxOps : public NodeOps } }; +NodeOps* 
SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + PriorBoxOps* ops = new PriorBoxOps(); + + return ops; +} + } // namespace PriorBoxImpl using namespace PriorBoxImpl; void RegisterPriorBoxNodeExec(void) { - PriorBoxOps* ops = new PriorBoxOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "PriorBox", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "PriorBox", PriorBoxImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/region.cpp b/executor/operator/common/region.cpp index a380f137a..bc781c60b 100644 --- a/executor/operator/common/region.cpp +++ b/executor/operator/common/region.cpp @@ -135,15 +135,26 @@ struct RegionOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + RegionOps* ops = new RegionOps(); + + return ops; +} + } // namespace RegionImpl using namespace RegionImpl; void RegisterRegionNodeExec(void) { - RegionOps* ops = new RegionOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Region", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Region", RegionImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/relu.cpp b/executor/operator/common/relu.cpp index 57e963a68..f353d68d1 100644 --- a/executor/operator/common/relu.cpp +++ b/executor/operator/common/relu.cpp @@ -103,15 +103,26 @@ struct ReLuOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, 
Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ReLuOps* ops = new ReLuOps(); + + return ops; +} + } // namespace ReLuImpl using namespace ReLuImpl; void RegisterReLuNodeExec(void) { - ReLuOps* ops = new ReLuOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "ReLu", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "ReLu", ReLuImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/relu6.cpp b/executor/operator/common/relu6.cpp index 635d4a382..87d946b75 100644 --- a/executor/operator/common/relu6.cpp +++ b/executor/operator/common/relu6.cpp @@ -89,15 +89,26 @@ struct ReLu6Ops : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ReLu6Ops* ops = new ReLu6Ops(); + + return ops; +} + } // namespace ReLu6Impl using namespace ReLu6Impl; void RegisterReLu6NodeExec(void) { - ReLu6Ops* ops = new ReLu6Ops(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "ReLu6", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "ReLu6", ReLu6Impl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/reorg.cpp b/executor/operator/common/reorg.cpp index 3342d8911..52bea5da2 100644 --- a/executor/operator/common/reorg.cpp +++ b/executor/operator/common/reorg.cpp @@ -81,15 +81,26 @@ struct ReorgOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = 
input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ReorgOps* ops = new ReorgOps(); + + return ops; +} + } // namespace ReorgImpl using namespace ReorgImpl; void RegisterReorgNodeExec(void) { - ReorgOps* ops = new ReorgOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Reorg", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Reorg", ReorgImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/reshape.cpp b/executor/operator/common/reshape.cpp index 4f0ec4677..3785048fa 100644 --- a/executor/operator/common/reshape.cpp +++ b/executor/operator/common/reshape.cpp @@ -54,15 +54,26 @@ struct ReshapeOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ReshapeOps* ops = new ReshapeOps(); + + return ops; +} + } // namespace ReshapeImpl using namespace ReshapeImpl; void RegisterReshapeNodeExec(void) { - ReshapeOps* ops = new ReshapeOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Reshape", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Reshape", ReshapeImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/resize.cpp b/executor/operator/common/resize.cpp index 4ba4f4797..1ce8316ad 100644 --- a/executor/operator/common/resize.cpp +++ b/executor/operator/common/resize.cpp @@ -309,15 +309,26 @@ struct ResizeOps : public MTNodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* 
exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ResizeOps* ops = new ResizeOps(); + + return ops; +} + } // namespace ResizeImpl using namespace ResizeImpl; void RegisterResizeNodeExec(void) { - ResizeOps* ops = new ResizeOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Resize", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Resize", ResizeImpl::SelectFunc, 1000); } } // namespace TEngine \ No newline at end of file diff --git a/executor/operator/common/roi_pooling.cpp b/executor/operator/common/roi_pooling.cpp index bd550175f..07964188f 100644 --- a/executor/operator/common/roi_pooling.cpp +++ b/executor/operator/common/roi_pooling.cpp @@ -117,15 +117,26 @@ struct ROIPoolingOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ROIPoolingOps* ops = new ROIPoolingOps(); + + return ops; +} + } // namespace ROIPoolingImpl using namespace ROIPoolingImpl; void RegisterROIPoolingNodeExec(void) { - ROIPoolingOps* ops = new ROIPoolingOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "ROIPooling", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "ROIPooling", ROIPoolingImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/rpn.cpp b/executor/operator/common/rpn.cpp index fe669535a..e06ced2ec 100644 --- a/executor/operator/common/rpn.cpp +++ b/executor/operator/common/rpn.cpp @@ -321,15 +321,26 @@ struct RPNOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = 
input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + RPNOps* ops = new RPNOps(); + + return ops; +} + } // namespace RPNImpl using namespace RPNImpl; void RegisterRPNNodeExec(void) { - RPNOps* ops = new RPNOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "RPN", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "RPN", RPNImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/scale.cpp b/executor/operator/common/scale.cpp index f291673ec..8cf3d9953 100644 --- a/executor/operator/common/scale.cpp +++ b/executor/operator/common/scale.cpp @@ -108,15 +108,26 @@ struct ScaleOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ScaleOps* ops = new ScaleOps(); + + return ops; +} + } // namespace ScaleImpl using namespace ScaleImpl; void RegisterScale_NodeExec(void) { - ScaleOps* ops = new ScaleOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Scale", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Scale", ScaleImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/slice.cpp b/executor/operator/common/slice.cpp index b65389322..469214d2d 100644 --- a/executor/operator/common/slice.cpp +++ b/executor/operator/common/slice.cpp @@ -19,7 +19,7 @@ /* * Copyright (c) 2018, Open AI Lab - * Author: chunyinglv@openailab.com + * Author: ruizhang@openailab.com */ #include #include @@ -30,42 +30,160 @@ #include "node_ops.hpp" #include "tensor_mem.hpp" #include "graph.hpp" +#include "operator/slice.hpp" + namespace 
TEngine { namespace SliceImpl { - +const int default_prio = 200; struct SliceOps : public NodeOps { - bool Run(Node* node) + template + bool caffe_run(Node *node) { - // currently, only working on channel C (slice_axis=1) + // get the slice param + Slice * slice_op = dynamic_cast(node->GetOp()); + SliceParam * param = slice_op->GetParam(); + int slice_axis = param->axis; + int num_slices = 1; + int slice_size = 1; Tensor* input_tensor = node->GetInputTensor(0); - Tensor* output_tensor0 = node->GetOutputTensor(0); - Tensor* output_tensor1 = node->GetOutputTensor(1); - const std::vector& dims = input_tensor->GetShape().GetDim(); + const TShape& input_shape = input_tensor->GetShape(); + T* input = ( T* )get_tensor_mem(input_tensor); + std::vector in_dim = input_shape.GetDim(); + for(int i = 0; i < slice_axis; i++) + { + num_slices = num_slices * in_dim[i]; + } + for(unsigned int i = slice_axis + 1; i < in_dim.size(); i++) + { + slice_size = slice_size * in_dim[i]; + } + int in_slice = in_dim[slice_axis]; + int slice_index = 0; + unsigned int out_num = node->GetOutputNum(); + for(unsigned int i = 0; i < out_num; i++) + { + Tensor* output_tensor = node->GetOutputTensor(i); + T* output = (T* )get_tensor_mem(output_tensor); + int out_slice = (output_tensor->GetShape()).Shape(slice_axis); + + for(int n = 0; n < num_slices; n++) + { + int in_offset = (n * in_slice + slice_index) * slice_size; + int out_offset = n * out_slice * slice_size; + memcpy(output+out_offset,input + in_offset,slice_size * out_slice * sizeof(T)); + } + slice_index += out_slice; + } + return true; + + } + template + bool tf_run(Node *node) + { + // get the slice param + Slice * slice_op = dynamic_cast(node->GetOp()); + SliceParam * param = slice_op->GetParam(); + // get the input data + Tensor* input_tensor = node->GetInputTensor(0); + const TShape& input_shape = input_tensor->GetShape(); + T* input = (T* )get_tensor_mem(input_tensor); + Tensor* output_tensor = node->GetOutputTensor(0); + T *output = 
(T* )get_tensor_mem(output_tensor); + std::vector in_dim = input_shape.GetDim(); + int in_dim_new[4]; + int maxdim = 4; + int begins[4]; + int sizes[4]; + int real_dim = param->begin_.size(); + int dim_idx = 0; + for(int idx = 0; idx < maxdim; idx++) + { + if(maxdim - idx > real_dim) + { + begins[idx] = 0; + sizes[idx] = 1; + in_dim_new[idx] = 1; + } + else + { + begins[idx] = param->begin_[dim_idx]; + sizes[idx] = param->size_[dim_idx]; + in_dim_new[idx] = in_dim[dim_idx]; + dim_idx++; + } + } + int in_dim_0 = in_dim_new[0]; + int in_dim_1 = in_dim_new[1]; + int in_dim_2 = in_dim_new[2]; + int in_dim_3 = in_dim_new[3]; - int hw = dims[2] * dims[3]; - int slice_size = dims[1] / 2 * hw; - int size = dims[1] * hw; - float* input = ( float* )get_tensor_mem(input_tensor); - float* output0 = ( float* )get_tensor_mem(output_tensor0); - float* output1 = ( float* )get_tensor_mem(output_tensor1); + int start_dim_0 = (4 - real_dim) > 0 ? 0 : begins[0]; + int stop_dim_0 = ((4 - real_dim) > 0 || sizes[0] == -1) + ? in_dim_0 - start_dim_0 + : start_dim_0 + sizes[0]; + int start_dim_1 = (3 - real_dim) > 0 ? 0 : begins[1]; + int stop_dim_1 = ((3 - real_dim) > 0 || sizes[1] == -1) + ? in_dim_1 - start_dim_1 + : start_dim_1 + sizes[1]; + int start_dim_2 = (2 - real_dim) > 0 ? 0 : begins[2]; + int stop_dim_2 = ((2 - real_dim) > 0 || sizes[2] == -1) + ? in_dim_2 - start_dim_2 + : start_dim_2 + sizes[2]; + int start_dim_3 = (1 - real_dim) > 0 ? 0 : begins[3]; + int stop_dim_3 = ((1 - real_dim) > 0 || sizes[3] == -1) + ? 
in_dim_3 - start_dim_3 + : start_dim_3 + sizes[3]; - for(int i = 0; i < dims[0]; i++) + for(int n = start_dim_0; n < stop_dim_0;++n) { - float* in0 = input + i * size; - float* in1 = in0 + slice_size; - for(int j = 0; j < slice_size; j++) + for(int i = start_dim_1; i < stop_dim_1; ++i) { - output0[j] = in0[j]; - output1[j] = in1[j]; + for(int j = start_dim_2; j < stop_dim_2; ++j) + { + int len = stop_dim_3 - start_dim_3; + int input_off = n * in_dim_1 * in_dim_2 * in_dim_3 + + i * in_dim_2 * in_dim_3 + + j * in_dim_3 + start_dim_3; + memcpy(output,input + input_off,len * sizeof(T)); + output += len; + } } } return true; } + bool Run(Node* node) + { + Slice * slice_op = dynamic_cast(node->GetOp()); + SliceParam * param = slice_op->GetParam(); + if(param->iscaffe) + { + return caffe_run(node); + } + else + { + return tf_run(node); + } + } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ +#ifdef CONFIG_ATUH_DEVICE + if(!get_auth_float_enabled()) + return nullptr; +#endif + + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 ||exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + SliceOps* ops = new SliceOps(); + return ops; +} } // namespace SliceImpl @@ -73,9 +191,9 @@ using namespace SliceImpl; void RegisterSliceNodeExec(void) { - SliceOps* ops = new SliceOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Slice", ops); + if(!NodeOpsRegistryManager::RegisterOPImplementor("common", "Slice", SliceImpl::SelectFunc, + SliceImpl::default_prio)) + LOG_ERROR()<<__FUNCTION__<<" :Regist OP failed for prio["<GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + SoftmaxOps* ops = new SoftmaxOps(); + + return 
ops; +} + } // namespace SoftmaxImpl using namespace SoftmaxImpl; void RegisterSoftmaxNodeExec(void) { - SoftmaxOps* ops = new SoftmaxOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Softmax", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Softmax", SoftmaxImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/init.cpp b/executor/operator/init.cpp new file mode 100644 index 000000000..5fc6f6373 --- /dev/null +++ b/executor/operator/init.cpp @@ -0,0 +1,27 @@ +namespace TEngine { + +extern void NodeOpsRegistryManagerInit(void); +extern void RegisterCommonOps(void); +extern void RegisterRefOps(void); + +#if CONFIG_ARCH_ARM64 == 1 +extern void RegisterArmOps(void); +#endif + +} + +using namespace TEngine; + +extern "C" int register_hclcpu_ops(void) +{ + RegisterCommonOps(); + RegisterRefOps(); + +#if CONFIG_ARCH_ARM64 + RegisterArmOps(); +#endif + + return 0; + +} + diff --git a/executor/operator/ref/Makefile b/executor/operator/ref/Makefile index f1037d992..62d066cfb 100644 --- a/executor/operator/ref/Makefile +++ b/executor/operator/ref/Makefile @@ -1,3 +1,34 @@ -obj-y+=demo_operator.o +obj-y+=init.o +obj-y+=ref_convolution.o +obj-y+=ref_pooling.o +obj-y+=ref_deconvolution.o +obj-y+=ref_fully_connected.o +obj-y+=ref_softmax.o +obj-y+=ref_concat.o +obj-y+=ref_permute.o +obj-y+=ref_swap_axis.o +obj-y+=ref_rpn.o +obj-y+=prelu.o +obj-y+=relu.o +obj-y+=relu6.o +obj-y+=sigmoid.o +obj-y+=squeeze.o +obj-y+=tanh.o +obj-y+=resize.o +obj-y+=reshape.o +obj-y+=flatten.o +obj-y+=dropout.o +obj-y+=ref_detection_postprocess.o +obj-y+=ref_lrn.o +obj-y+=eltwise.o +obj-y+=ref_slice.o +obj-y+=split.o +obj-y+=pad.o +obj-y+=reduction.o +obj-y+=ref_add_n.o +obj-y+=ref_batchnorm.o +obj-y+=ref_normalize.o + +COMMON_CFLAGS+=-I. 
diff --git a/executor/operator/ref/demo_operator.cpp b/executor/operator/ref/demo_operator.cpp deleted file mode 100644 index 132bbfc46..000000000 --- a/executor/operator/ref/demo_operator.cpp +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ - -#include -#include - -#include "logger.hpp" -#include "graph.hpp" -#include "node_ops.hpp" - -namespace TEngine { - -namespace demo_ops { - -struct DemoOps : public MTNodeOps -{ -public: - bool FloatPrerun(Node* node) - { - LOG_INFO() << "float prerun done!\n"; - return true; - } - - bool FloatPostrun(Node* node) - { - LOG_INFO() << "float post run done!\n"; - return true; - } - - bool FloatRun(Node* node) - { - LOG_INFO() << "float run done!\n"; - return true; - } - - bool IntPrerun(Node* node) - { - LOG_INFO() << "int prerun done!\n"; - return true; - } - - bool IntPostrun(Node* node) - { - LOG_INFO() << "int post run done!\n"; - return true; - } - - bool IntRun(Node* node) - { - LOG_INFO() << "int run done!\n"; - return true; - } - - bool MTIntRun(Node* node) - { - std::vector task_list; - - for(int i = 0; i < cpu_info->GetCPUNumber() * 2; i++) - { - sub_op_task task; - task.exec_func = std::move(std::bind(&DemoOps::IntAider, this, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3)); - task.seq = i; - task.data = ( void* )(( unsigned long )i); - - task_list.push_back(task); - } - - task_dispatch(task_list, -1); - - wait_done(); - - return true; - } - - bool MTFloatRun(Node* node) - { - std::vector task_list; - - for(int i = 0; i < cpu_info->GetCPUNumber() * 2; i++) - { - sub_op_task task; - task.exec_func = std::bind(&DemoOps::FloatAider, this, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - task.seq = i; - task.data = ( void* )(( unsigned long )i); - - task_list.push_back(task); - } - - task_dispatch(task_list, -1); - - wait_done(); - - return true; - } - - bool IntAider(int cpu, int seq, void* data) - { - int cpu_model = cpu_info->GetCPUModel(cpu); - - if(cpu_model == CPU_A72) - A53IntAider(cpu, seq, data); - else - A72IntAider(cpu, seq, data); - - return true; - } - - bool FloatAider(int cpu, int seq, void* data) - { - int 
cpu_model = cpu_info->GetCPUModel(cpu); - - if(cpu_model == CPU_A53) - A53FloatAider(cpu, seq, data); - else - A72FloatAider(cpu, seq, data); - - return true; - } - - bool A72FloatAider(int cpu, int seq, void* data) - { - unsigned long n = ( unsigned long )(data); - - LOG_INFO() << "cpu: " << cpu << " A72 FLOAT called\n"; - LOG_INFO() << "cpu: " << cpu << " will sleep " << n << " seconds\n"; - - std::chrono::milliseconds sleep_time(n * 1000); - std::this_thread::sleep_for(sleep_time); - - LOG_INFO() << "cpu: " << cpu << " DONE\n"; - - return true; - } - - bool A53FloatAider(int cpu, int seq, void* data) - { - unsigned long n = ( unsigned long )(data); - - LOG_INFO() << "cpu: " << cpu << " A53 FLOAT called\n"; - LOG_INFO() << "cpu: " << cpu << " will sleep " << n << " seconds\n"; - - std::chrono::milliseconds sleep_time(n * 1000); - std::this_thread::sleep_for(sleep_time); - - LOG_INFO() << "cpu: " << cpu << " DONE\n"; - - return true; - } - - bool A72IntAider(int cpu, int seq, void* data) - { - unsigned long n = ( unsigned long )(data); - - LOG_INFO() << "cpu: " << cpu << " A72 INT called\n"; - LOG_INFO() << "cpu: " << cpu << " will sleep " << n << " seconds\n"; - - std::chrono::milliseconds sleep_time(n * 1000); - std::this_thread::sleep_for(sleep_time); - - LOG_INFO() << "cpu: " << cpu << " DONE\n"; - - return true; - } - - bool A53IntAider(int cpu, int seq, void* data) - { - unsigned long n = ( unsigned long )(data); - - LOG_INFO() << "cpu: " << cpu << " A53 INT called\n"; - LOG_INFO() << "cpu: " << cpu << " will sleep " << n << " seconds\n"; - - std::chrono::milliseconds sleep_time(n * 1000); - std::this_thread::sleep_for(sleep_time); - - LOG_INFO() << "cpu: " << cpu << " DONE\n"; - - return true; - } - - /*****************************************************/ - bool Prerun(Node* node) override - { - if(float_mode) - return FloatPrerun(node); - else - return IntPrerun(node); - } - - bool Run(Node* node) override - { - std::cout << "Run launched on : " << 
cpu_info->GetCPUModelString(cpu_info->GetMasterCPU()) << "\n"; - - if(float_mode) - { - if(mt_mode) - return MTFloatRun(node); - else - return FloatRun(node); - } - else - { - if(mt_mode) - return MTIntRun(node); - else - return IntRun(node); - } - } - - bool Postrun(Node* node) override - { - if(float_mode) - return FloatPostrun(node); - else - return IntPostrun(node); - } - - DemoOps() - { - float_mode = true; - mt_mode = false; - } - - bool float_mode; - bool mt_mode; -}; - -NodeOps* SelectFunc(const CPUInfo* info, Node* node) -{ - DemoOps* ops = new DemoOps(); - - if(info->GetCPUNumber() > 1) - ops->mt_mode = true; - else - ops->mt_mode = false; - - Tensor* input_tensor = node->GetInputTensor(0); - - if(input_tensor->GetDataType() == TENGINE_DT_FP32) - { - ops->float_mode = true; - } - else - { - ops->float_mode = false; - } - - return ops; -} - -} // namespace demo_ops - -using namespace demo_ops; - -void RegisterDemoOps(void) -{ - NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "DemoOp", demo_ops::SelectFunc, 1000); -} - -} // namespace TEngine diff --git a/executor/operator/ref/dropout.cpp b/executor/operator/ref/dropout.cpp new file mode 100644 index 000000000..7d5c600d1 --- /dev/null +++ b/executor/operator/ref/dropout.cpp @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +// #include "kernel/Dropout/Dropout_kernel.h" + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/dropout.hpp" + +namespace TEngine { + +namespace RefDropoutOps { + + + +struct RefDropout : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + bool Postrun(Node * node) override; + // void InitRegistry(void); + // Dropout_param op_param; + // void * mem; + // Dropout_t kernel_run; + + + // KernelRegistry kernel_registry; + + RefDropout(void) + { + // mem=nullptr; + // kernel_run=nullptr; + + // InitRegistry(); + } +}; + + +bool RefDropout::Prerun(Node * node) +{ + // Tensor * input=node->GetInputTensor(0); + // Tensor* output_tensor = node->GetOutputTensor(0); + // int layout=exec_attr->graph_layout; + + // if(input->GetDataType() == TENGINE_DT_INT8 || + // input->GetDataType() == TENGINE_DT_UINT8 ) + // { + // if(get_scale_zero(input, output_tensor, &op_param) < 0) + // return false; + // } + + + // if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + // { + // set_tengine_errno(ENOENT); + // return false; + // } + + return true; +} + +bool RefDropout::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool RefDropout::Run(Node * node) +{ + // Tensor* input_tensor = node->GetInputTensor(0); + // Tensor* output_tensor = node->GetOutputTensor(0); + // const TShape& shape = input_tensor->GetShape(); + // void* data = get_tensor_mem(input_tensor); + // void* out_data = get_tensor_mem(output_tensor); + + // int size = shape.GetSize(); + // int 
ret=kernel_run(data,out_data,size,&op_param); + + // if(ret<0) + // return false; + // else + // return true; + + Tensor* input = node->GetInputTensor(0); + Tensor* output = node->GetOutputTensor(0); + auto i_quant = input->GetQuantParam(); + auto o_quant = output->GetQuantParam(); + if(i_quant->size() != 1) + { + LOG_ERROR()<<"input quant param num isnot 1 \n"; + return false; + } + o_quant->resize(0); + o_quant->push_back((*i_quant)[0]); + + return true; +} + +bool RefDropout::Postrun(Node * node) +{ + return true; +} + +// void RefDropout::InitRegistry(void) +// { +// #ifdef CONFIG_KERNEL_FP32 +// kernel_registry.Register((Dropout_t)Dropout_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); +// kernel_registry.Register((Dropout_t)Dropout_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +// #endif + +// #ifdef CONFIG_KERNEL_FP16 +// kernel_registry.Register((Dropout_t)Dropout_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); +// kernel_registry.Register((Dropout_t)Dropout_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +// #endif +// #ifdef CONFIG_KERNEL_INT8 +// kernel_registry.Register((Dropout_t)Dropout_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); +// kernel_registry.Register((Dropout_t)Dropout_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +// #endif + +// #ifdef CONFIG_KERNEL_UINT8 +// kernel_registry.Register((Dropout_t)Dropout_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); +// kernel_registry.Register((Dropout_t)Dropout_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +// #endif + +// } + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefDropout* ops = new RefDropout(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefReluOps +void RegisterDropoutOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Dropout", RefDropoutOps::SelectFunc, 1000); +} +} // namespace TEngine diff --git a/executor/operator/ref/eltwise.cpp b/executor/operator/ref/eltwise.cpp new file mode 100644 index 000000000..4ebaf9cd7 --- /dev/null 
+++ b/executor/operator/ref/eltwise.cpp @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "kernel/eltwise/eltwise.h" + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/eltwise.hpp" + +namespace TEngine { + +namespace RefEltwiseOps { + + + +struct EltwiseOps : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + bool Postrun(Node * node) override; + void InitRegistry(void); + + eltwise_param op_param; + eltwise_t kernel_run; + + KernelRegistry kernel_registry; + + EltwiseOps(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; +static int get_scale_zero(Tensor* itensor,Tensor * otensor,eltwise_param* param) +{ + auto* i_quant = itensor->GetQuantParam(); + auto* o_quant = otensor->GetQuantParam(); + if( i_quant->size() != 1 ) + { + LOG_ERROR()<<"Input quant size: ("<size()<<")\n"; + return -1; + } + param->scale[0] = (*i_quant)[0].scale; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + if( 
o_quant->size() != 1) + { + LOG_ERROR()<<"Output quant size: ("<size()<<")\n"; + return -1; + } + + param->scale[2] = (*o_quant)[0].scale; + param->zero[2] = (*o_quant)[0].zero_point; + + param->zero[0] = (*i_quant)[0].zero_point; + } + + return 0; +} +static int get_scale_zero_1(Tensor* itensor,eltwise_param* param) +{ + auto* i_quant = itensor->GetQuantParam(); + if( i_quant->size() != 1 ) + { + LOG_ERROR()<<"Input quant size: ("<size()<<")\n"; + return -1; + } + param->scale[1] = (*i_quant)[0].scale; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + + param->zero[1] = (*i_quant)[0].zero_point; + } + return 0; +} + +bool EltwiseOps::Prerun(Node * node) +{ + Tensor * input_tensor=node->GetInputTensor(0); + + int layout=exec_attr->graph_layout; + + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + //int elem_size=DataType::GetTypeSize(input->GetDataType()); + + return true; +} + + +bool EltwiseOps::Run(Node * node) +{ + Tensor* input_tensor0 = node->GetInputTensor(0); + int element_size = DataType::GetTypeSize(input_tensor0->GetDataType()); + const TShape& ishape = input_tensor0->GetShape(); + void* input0 = get_tensor_mem(input_tensor0); + Tensor* input_tensor1 = nullptr; + void* input1 = nullptr; + int input1_count4 = 0; + int input_chan_1 = 0; + int input_hw_1 = 0; + int input_h_1 = 0; + int input_w_1 = 0; + int input_n_1 = 0; + // this version only support for input_num=2 + // int input_number=node->GetInputNum(); + + // output + Tensor* output_tensor = node->GetOutputTensor(0); + if(input_tensor0->GetDataType() == TENGINE_DT_INT8 ||input_tensor0->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero(input_tensor0,output_tensor, &op_param) < 0) + return false; + } + + if(node->GetInputNum() > 1) + { + input_tensor1 = node->GetInputTensor(1); + const TShape& ishape1 = input_tensor1->GetShape(); + input1 = get_tensor_mem(input_tensor1); + input1_count4 = 
input_tensor1->GetTotalSize() / element_size; + input_n_1=ishape1.GetN(); + input_chan_1 = ishape1.GetC(); + input_hw_1 = ishape1.GetH() * ishape1.GetW(); + input_h_1=ishape1.GetH(); + input_w_1=ishape1.GetW(); + + if(input_tensor1->GetDataType() == TENGINE_DT_INT8 || + input_tensor1->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero_1(input_tensor1, &op_param) < 0) + return false; + } + } + int layout = ishape.GetDataLayout(); + void* output = get_tensor_mem(output_tensor); + Eltwise* eltwise_op = dynamic_cast(node->GetOp()); + EltwiseParam* param = eltwise_op->GetParam(); + int input_count4 = input_tensor0->GetTotalSize() / element_size; + int input_chan = ishape.GetC(); + int input_hw = ishape.GetH() * ishape.GetW(); + int input_h=ishape.GetH(); + int input_w=ishape.GetW(); + int input_n=ishape.GetN(); + //get out_tensor size + Tensor* output_tensor0 = node->GetOutputTensor(0); + int out_element_size = DataType::GetTypeSize(output_tensor0->GetDataType()); + int out_size = output_tensor0->GetTotalSize()/out_element_size; + float * output_buf=(float *)malloc(sizeof(float)*out_size); + int ret=kernel_run(output, input0, input1, param->type, input_count4, + input_chan,input_chan_1,input_hw,input_hw_1, input1_count4, + input_h,input_w,input_h_1,input_w_1,input_n,input_n_1,layout, + out_size,output_buf,&op_param); + free(output_buf); + + if(input_tensor1->GetDataType() == TENGINE_DT_INT8 + || input_tensor0->GetDataType() == TENGINE_DT_INT8) + { + + auto* o_quant = output_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale =op_param.scale[2]; + o_quant->resize(0); + o_quant->push_back(q_param); + + } + + + if(ret<0) + return false; + else + return true; +} + +bool EltwiseOps::Postrun(Node * node) +{ + return true; +} + +void EltwiseOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((eltwise_t)eltwise_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + 
kernel_registry.Register((eltwise_t)eltwise_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((eltwise_t)eltwise_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((eltwise_t)eltwise_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((eltwise_t)eltwise_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((eltwise_t)eltwise_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((eltwise_t)eltwise_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((eltwise_t)eltwise_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + EltwiseOps* ops = new EltwiseOps(); + + LOG_DEBUG()<<"EltwiseOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefEltwiseOps +void RegisterEltwiseOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Eltwise", RefEltwiseOps::SelectFunc, 1000); +} + +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/flatten.cpp b/executor/operator/ref/flatten.cpp new file mode 100644 index 000000000..4ebcab773 --- /dev/null +++ b/executor/operator/ref/flatten.cpp @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + + + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/flatten.hpp" + +namespace TEngine { + +namespace RefFlattenOps { + + + +struct RefFlatten : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + bool Postrun(Node * node) override; + + RefFlatten(void) + { + + } +}; + + +bool RefFlatten::Prerun(Node * node) +{ + return true; +} + +bool RefFlatten::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool RefFlatten::Run(Node * node) +{ + + return true; +} + +bool RefFlatten::Postrun(Node * node) +{ + return true; +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefFlatten* ops = new RefFlatten(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefReluOps +void RegisterFlattenOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Flatten", RefFlattenOps::SelectFunc, 1000); +} +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/init.cpp b/executor/operator/ref/init.cpp new file mode 100644 index 000000000..5f3015803 --- /dev/null +++ b/executor/operator/ref/init.cpp @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) 
under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: haitao@openailab.com + */ +#include +#include + +namespace TEngine { + +extern void RegisterRefPoolingOps(void); +extern void RegisterRefConv2d(void); +extern void RegisterRefDeconv2d(void); +extern void RegisterRefSoftmaxOps(void); +extern void RegisterRefDetectionPostOps(void); +extern void RegisterRefFCOps(void); +extern void RegisterRelu6Ops(void); +extern void RegisterReluOps(void); +extern void RegisterPreluOps(void); +extern void RegisterTanhOps(void); +extern void RegisterSigmoidOps(void); +extern void RegisterResizeOps(void); +extern void RegisterFlattenOps(void); +extern void RegisterReshapeOps(void); +extern void RegisterDropoutOps(void); +extern void RegisterRefConcat(void); +extern void RegisterRefPermute(void); +extern void RegisterRefLrn(void); +extern void RegisterEltwiseOps(void); +extern void RegisterRefSlice(void); +extern void RegisterSplitOps(void); +extern void RegisterPadOps(void); +extern void RegisterReductionOps(void); +extern void RegisterSqueezeOps(void); +extern void RegisterSwapAxisOps(void); +extern void RegisterRefRPNOps(void); +extern void RegisterRefBatchNormOps(void); +extern void RegisterRefNormlizeOps(void); +extern void 
RegisterRefAddNOps(void); + +void RegisterRefOps(void) +{ + RegisterRefPoolingOps(); + RegisterRefConv2d(); + RegisterRefDeconv2d(); + RegisterRefSoftmaxOps(); + RegisterRefDetectionPostOps(); + RegisterRefFCOps(); + RegisterRefConcat(); + RegisterRefPermute(); + RegisterRelu6Ops(); + RegisterReluOps(); + RegisterPreluOps(); + RegisterTanhOps(); + RegisterSigmoidOps(); + RegisterResizeOps(); + RegisterFlattenOps(); + RegisterReshapeOps(); + RegisterDropoutOps(); + RegisterRefLrn(); + RegisterEltwiseOps(); + RegisterRefSlice(); + RegisterSplitOps(); + RegisterPadOps(); + RegisterReductionOps(); + RegisterSqueezeOps(); + RegisterSwapAxisOps(); + RegisterRefRPNOps(); + RegisterRefBatchNormOps(); + RegisterRefNormlizeOps(); + RegisterRefAddNOps(); + +} + +} // namespace TEngine diff --git a/executor/operator/ref/kernel/concat/concat_fp16.c b/executor/operator/ref/kernel/concat/concat_fp16.c new file mode 100644 index 000000000..a494301df --- /dev/null +++ b/executor/operator/ref/kernel/concat/concat_fp16.c @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + + +static int ref_concat_fp16(const __fp16** in_data, __fp16* out_data, const struct concat_param* param) +{ + int axis = param->axis; + int concat_dim = 0; + for( int ii=0; iiinput_counts; ++ii ) + { + concat_dim += param->input_shape[ii].dim[axis]; + } + + if( concat_dim != param->output_shape.dim[axis] ) + { + printf("concant dimensions[%d] is not same output[%d]\n",concat_dim,param->output_shape.dim[axis]); + return -1; + } + + int out_size,in_size; + + out_size = 1; + for( int ii=0; iioutput_shape.dim[ii]; + } + in_size = 1; + for( int ii=axis+1; ii < param->output_dim; ++ii ) + { + in_size *= param->input_shape[0].dim[ii]; + } + + __fp16* output_ptr = out_data; + + for(int k = 0; k < out_size; ++k ) + { + for(int j = 0 ; jinput_counts; ++j ) + { + int cp_size = param->input_shape[j].dim[axis] * in_size; + memcpy(output_ptr, in_data[j] + k * cp_size, cp_size* sizeof(__fp16)); + output_ptr += cp_size; + } + } + + return 0; +} diff --git a/executor/operator/ref/kernel/concat/concat_fp32.c b/executor/operator/ref/kernel/concat/concat_fp32.c new file mode 100644 index 000000000..3065dac19 --- /dev/null +++ b/executor/operator/ref/kernel/concat/concat_fp32.c @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + + +static int ref_concat_fp32(const float** in_data, float* out_data, const struct concat_param* param) +{ + int axis = param->axis; + int concat_dim = 0; + for( int ii=0; iiinput_counts; ++ii ) + { + concat_dim += param->input_shape[ii].dim[axis]; + } + + if( concat_dim != param->output_shape.dim[axis] ) + { + printf("concant dimensions[%d] is not same output[%d]\n",concat_dim,param->output_shape.dim[axis]); + return -1; + } + + int out_size,in_size; + + out_size = 1; + for( int ii=0; iioutput_shape.dim[ii]; + } + in_size = 1; + for( int ii=axis+1; ii < param->output_dim; ++ii ) + { + in_size *= param->input_shape[0].dim[ii]; + } + + float* output_ptr = out_data; + + for(int k = 0; k < out_size; ++k ) + { + for(int j = 0 ; jinput_counts; ++j ) + { + int cp_size = param->input_shape[j].dim[axis] * in_size; + memcpy(output_ptr, in_data[j] + k * cp_size, cp_size* sizeof(float)); + output_ptr += cp_size; + } + } + + return 0; +} diff --git a/executor/operator/ref/kernel/concat/concat_int8.c b/executor/operator/ref/kernel/concat/concat_int8.c new file mode 100644 index 000000000..d4eff55f9 --- /dev/null +++ b/executor/operator/ref/kernel/concat/concat_int8.c @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + + +static int ref_concat_int8(const int8_t** in_data,int8_t* out_data,const struct concat_param* param) +{ + int axis = param->axis; + int concat_dim = 0; + for( int ii=0; iiinput_counts; ++ii ) + { + concat_dim += param->input_shape[ii].dim[axis]; + } + + if( concat_dim != param->output_shape.dim[axis] ) + { + printf("concant dimensions[%d] is not same output[%d]\n",concat_dim,param->output_shape.dim[axis]); + return -1; + } + + int outer_size,in_size; + outer_size = 1; + for( int ii=0; iioutput_shape.dim[ii]; + } + in_size = 1; + for( int ii=axis+1; ii < param->output_dim; ++ii ) + { + in_size *= param->output_shape.dim[ii]; + } + + int output_size = 1; + for( int ii=0; iioutput_dim;++ii ) + { + output_size *= param->output_shape.dim[ii]; + } + + float* output_tmp = (float*)malloc(output_size * 4); + if(NULL == output_tmp) + { + printf("Malloc output tmp memory failed\n"); + return -1; + } + + float* output_ptr = output_tmp; + float max_scale = 0.0f; + for(int k = 0; k < outer_size; ++k ) + { + for(int j = 0 ; jinput_counts; ++j ) + { + int cp_size = param->input_shape[j].dim[axis] * in_size; + float scale = param->input_shape[j].scale; + const int8_t* input_ptr = in_data[j] + k * cp_size; + + for(int ii=0; ii(param); + out_param->out_scale = max_scale; + + free(output_tmp); + return 0; +} diff --git a/executor/operator/ref/kernel/concat/concat_kernel.h b/executor/operator/ref/kernel/concat/concat_kernel.h new file mode 100644 index 000000000..0300cd21c --- 
/dev/null +++ b/executor/operator/ref/kernel/concat/concat_kernel.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + +#ifndef __CONACT_KERNEL_H__ +#define __CONACT_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct shape_dim +{ + int dim[4]; + float scale; + int zero; +}; + +struct concat_param +{ + struct shape_dim* input_shape; + int input_counts; + int input_dim; + struct shape_dim output_shape; + int output_dim; + int axis; + float out_scale; +}; + +typedef int (*concat_t)(const void** in_data, void* out_data, const struct concat_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "concat_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "concat_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "concat_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "concat_uint8.c" +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/concat/concat_uint8.c b/executor/operator/ref/kernel/concat/concat_uint8.c new file mode 100644 index 000000000..4902431c6 --- /dev/null +++ 
b/executor/operator/ref/kernel/concat/concat_uint8.c @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + + +static int ref_concat_uint8(const uint8_t** in_data,uint8_t* out_data,const struct concat_param* param) +{ + int axis = param->axis; + int concat_dim = 0; + for( int ii=0; iiinput_counts; ++ii ) + { + concat_dim += param->input_shape[ii].dim[axis]; + } + + if( concat_dim != param->output_shape.dim[axis] ) + { + printf("concant dimensions[%d] is not same output[%d]\n",concat_dim,param->output_shape.dim[axis]); + return -1; + } + + int outer_size,in_size; + outer_size = 1; + for( int ii=0; iioutput_shape.dim[ii]; + } + in_size = 1; + for( int ii=axis+1; ii < param->output_dim; ++ii ) + { + in_size *= param->output_shape.dim[ii]; + } + + int output_size = 1; + for( int ii=0; iioutput_dim;++ii ) + { + output_size *= param->output_shape.dim[ii]; + } + + float* output_tmp = (float*)malloc(output_size*4); + if(NULL == output_tmp) + { + printf("Malloc output tmp memory failed\n"); + return -1; + } + + float* output_ptr = output_tmp; + for(int k = 0; k < outer_size; ++k ) + { + for(int j = 0 ; jinput_counts; ++j ) + { + 
int cp_size = param->input_shape[j].dim[axis] * in_size; + float scale = param->input_shape[j].scale; + uint8_t input_zero = param->input_shape[j].zero; + + const uint8_t* input_ptr = (const uint8_t*)(in_data[j] + k * cp_size); + + for(int ii=0; iioutput_shape.scale; + uint8_t out_zero = param->output_shape.zero; + + uint8_t* last_output_ptr = out_data; + for(int ii=0; ii= 0) + { + if(tmp < 0) + tmp = 0; + if(activation== 1 && tmp>1) + tmp = 1; + if(activation== 2 && tmp>6) + tmp = 6; + } + +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + *input = fp32_to_fp16(tmp); +#else + *input = tmp; +#endif +} + +static int ref_conv_fp16(const __fp16 * input, __fp16 * output, const __fp16* kernel, const __fp16* bias, op_data* param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + + int kernel_size = input_c * param->kernels[0] * param->kernels[1]; + int n,g,c,h,w,kc,kh,kw; + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + for ( n = 0; n < batch; ++n){ + for( g = 0; g < group; ++g){ + for ( c = 0; c < output_c; ++c) { + for ( h = 0; h < output_h; ++h){ + for ( w = 0; w < output_w; ++w){ + const int h_start = (h * param->strides[0]) - param->pads[0]; + const int w_start = (w * param->strides[1]) - param->pads[1]; +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float total = bias ? fp16_to_fp32(bias[output_c* g + c]) : 0; +#else + __fp16 total = bias ? 
bias[output_c* g + c] : 0; +#endif + if(param->layout == 0){ + + output_offset = n*group*output_c*output_h*output_w + + g*output_c*output_h*output_w + + c* output_h*output_w + h*output_w + w; + } + else{ + output_offset = n*group*output_c*output_h*output_w + + h*output_w*group*output_c + w*group*output_c + + output_c*group + c; + + } + for (kc = 0; kc < input_c; ++kc){ + for ( kh = 0; kh < param->kernels[0]; ++kh){ + for ( kw = 0; kw < param->kernels[1]; ++kw){ + const int cur_y = h_start + param->dilations[0] * kh; + const int cur_x = w_start + param->dilations[1] * kw; + // If the location is outside the bounds of the input image, + // use zero as a default value. + if ((cur_x >= 0) && (cur_x < input_w) && (cur_y >= 0) && (cur_y < input_h)) { + if(param->layout == 0){ + input_offset = n*group*input_c*input_h*input_w + + g*input_c*input_h*input_w + + kc* input_h*input_w + cur_y*input_w + cur_x; + kernel_offset = g*output_c*kernel_size + c*kernel_size + + kc*param->kernels[0]*param->kernels[1] + + kh* param->kernels[1] + kw; + } + else{ + input_offset = n*group*input_c*input_h*input_w + + cur_y*input_w*input_c*group + cur_x* input_c*group + + g*input_c + kc; + kernel_offset = c*kernel_size*group +kh* param->kernels[1]*input_c*group + + kw*input_c*group + g*input_c + kc; + } + +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + total += fp16_to_fp32(input[input_offset]) * fp16_to_fp32(kernel[kernel_offset]); +#else + total += (input[input_offset] * kernel[kernel_offset]); +#endif + } + } + } + } +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + total = activation(total, param->activation); + output[output_offset] = fp32_to_fp16(total); +#else + activation_fp16(&total, param->activation); + output[output_offset] = total; +#endif + } + } + } + } + } + return 0; +} + diff --git a/executor/operator/ref/kernel/convolution/ref_conv_fp32.c b/executor/operator/ref/kernel/convolution/ref_conv_fp32.c new file mode 100644 index 000000000..e67864074 --- /dev/null +++ 
b/executor/operator/ref/kernel/convolution/ref_conv_fp32.c @@ -0,0 +1,80 @@ + + +static int ref_conv_fp32(const float * input, float * output, const float* kernel, const float* bias, op_data* param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + + int kernel_size = input_c * param->kernels[0] * param->kernels[1]; + int n,g,c,h,w,kc,kh,kw; + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + for ( n = 0; n < batch; ++n){ + for( g = 0; g < group; ++g){ + for ( c = 0; c < output_c; ++c) { + for ( h = 0; h < output_h; ++h){ + for ( w = 0; w < output_w; ++w){ + const int h_start = (h * param->strides[0]) - param->pads[0]; + const int w_start = (w * param->strides[1]) - param->pads[1]; + float total = 0.f; + if(param->layout == 0){ + + output_offset = n*group*output_c*output_h*output_w + + g*output_c*output_h*output_w + + c* output_h*output_w + h*output_w + w; + } + else{ + output_offset = n*group*output_c*output_h*output_w + + h*output_w*group*output_c + w*group*output_c + + output_c*g + c; + + } + for (kc = 0; kc < input_c; ++kc){ + for ( kh = 0; kh < param->kernels[0]; ++kh){ + for ( kw = 0; kw < param->kernels[1]; ++kw){ + const int cur_y = h_start + param->dilations[0] * kh; + const int cur_x = w_start + param->dilations[1] * kw; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((cur_x >= 0) && (cur_x < input_w) && (cur_y >= 0) && (cur_y < input_h)) { + if(param->layout == 0){ + input_offset = n*group*input_c*input_h*input_w + + g*input_c*input_h*input_w + + kc* input_h*input_w + cur_y*input_w + cur_x; + kernel_offset = g*output_c*kernel_size + c*kernel_size + + kc*param->kernels[0]*param->kernels[1] + + kh* param->kernels[1] + kw; + } + else{ + input_offset = n*group*input_c*input_h*input_w + + cur_y*input_w*input_c*group + cur_x* input_c*group + + g*input_c + kc; + kernel_offset = c*group*kernel_size + kh* param->kernels[1]*input_c*group + + kw*input_c*group + g*input_c + kc; + } + + total += (input[input_offset] * kernel[kernel_offset]); + } + } + } + } + float bias_value = 0.0f; + if (bias) { + bias_value = bias[output_c* g + c]; + } + output[output_offset] = activation(total + bias_value, param->activation); + } + } + } + } + } + return 0; +} + diff --git a/executor/operator/ref/kernel/convolution/ref_conv_int8.c b/executor/operator/ref/kernel/convolution/ref_conv_int8.c new file mode 100644 index 000000000..0b50c97c2 --- /dev/null +++ b/executor/operator/ref/kernel/convolution/ref_conv_int8.c @@ -0,0 +1,112 @@ + +static int ref_conv_int8(const int8_t * input, int8_t * output, const int8_t* kernel, const float* bias, op_data* param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + + int kernel_size = input_c * param->kernels[0] * param->kernels[1]; + + /* dequant input */ + int input_size = batch * group * input_c * input_h * input_w; + float* input_buf = (float*)malloc(sizeof(float) * input_size); + for(int i=0; iscale[0]; + } + + /* dequant kernel */ + int kernel_total = group *output_c* kernel_size; + float* kernel_buf = (float*)malloc(sizeof(float) * kernel_total); + for(int i=0; 
iscale[1]; + } + + /* malloc output */ + int output_size = group*batch*output_c*output_h*output_w; + float* output_buf = (float*)malloc(sizeof(float) * output_size); + + int n,g,c,h,w,kc,kh,kw; + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + for ( n = 0; n < batch; ++n){ + for( g = 0; g < group; ++g){ + for ( c = 0; c < output_c; ++c) { + for ( h = 0; h < output_h; ++h){ + for ( w = 0; w < output_w; ++w){ + const int h_start = (h * param->strides[0]) - param->pads[0]; + const int w_start = (w * param->strides[1]) - param->pads[1]; + float total = 0.f; + if(param->layout == 0){ + output_offset = n*group*output_c*output_h*output_w + + g*output_c*output_h*output_w + + c* output_h*output_w + h*output_w + w; + } + else{ + output_offset = n*group*output_c*output_h*output_w + + h*output_w*group*output_c + w*group*output_c + + output_c*group + c; + } + for (kc = 0; kc < input_c; ++kc){ + for ( kh = 0; kh < param->kernels[0]; ++kh){ + for ( kw = 0; kw < param->kernels[1]; ++kw){ + const int cur_y = h_start + param->dilations[0] * kh; + const int cur_x = w_start + param->dilations[1] * kw; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((cur_x >= 0) && (cur_x < input_w) && (cur_y >= 0) && (cur_y < input_h)) { + if(param->layout == 0){ + input_offset = n*group*input_c*input_h*input_w + + g*input_c*input_h*input_w + + kc* input_h*input_w + cur_y*input_w + cur_x; + kernel_offset = g*output_c*kernel_size + c*kernel_size + + kc*param->kernels[0]*param->kernels[1] + + kh* param->kernels[1] + kw; + } + else{ + input_offset = n*group*input_c*input_h*input_w + + cur_y*input_w*input_c*group + cur_x* input_c*group + + g*input_c + kc; + kernel_offset = c*kernel_size*group + kh* param->kernels[1]*input_c*group + + kw*input_c*group + g*input_c + kc; + } + + total += (input_buf[input_offset] * kernel_buf[kernel_offset]); + } + } + } + } + float bias_value = 0.0f; + if (bias) { + bias_value = bias[output_c* g + c]; + } + output_buf[output_offset] = activation(total + bias_value, param->activation); + } + } + } + } + } + float output_max = 0.0f; + for(int i =0; i< output_size; i++) + { + if(output_max < fabs(output_buf[i])) + output_max = fabs(output_buf[i]); + } + param->scale[2] = output_max/127; + for(int i =0; i< output_size; i++) + { + output[i] = round(output_buf[i]*127/output_max); + } + free(output_buf); + free(kernel_buf); + free(input_buf); + return 0; +} + diff --git a/executor/operator/ref/kernel/convolution/ref_conv_kernel.h b/executor/operator/ref/kernel/convolution/ref_conv_kernel.h new file mode 100644 index 000000000..60779708c --- /dev/null +++ b/executor/operator/ref/kernel/convolution/ref_conv_kernel.h @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#ifndef __REF_CONV_KERNEL_H__ +#define __REF_CONV_KERNEL_H__ + +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct op_data +{ + int in_shape[3]; //NCHW + int out_shape[3]; //CHW + int kernels[2]; + int strides[2]; + int dilations[2]; + int pads[2]; + int batch; + int group; + int activation; + int layout; + int zero[3]; //input, kernel, output + float scale[3]; //input, kernel, output +}; + +static inline float activation(float input, int activation) +{ + if( activation >= 0) + { + if(input < 0) + input = 0; + if(activation== 1 && input>1) + input = 1; + if(activation== 6 && input>6) + input = 6; + } + + return input; +} + +typedef int (*ref_conv_kernel_t)(const void * input, void * output, const void* kernel, const void* bias, op_data* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_conv_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_conv_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_conv_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_conv_uint8.c" +#endif + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/convolution/ref_conv_uint8.c b/executor/operator/ref/kernel/convolution/ref_conv_uint8.c new file mode 100644 index 000000000..ce56ddbe2 --- /dev/null +++ b/executor/operator/ref/kernel/convolution/ref_conv_uint8.c @@ -0,0 +1,108 @@ + +static int ref_conv_uint8(const uint8_t * input, uint8_t * output, const uint8_t* kernel, const int* 
bias, op_data* param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + + int kernel_size = input_c * param->kernels[0] * param->kernels[1]; + + /* dequant input */ + int input_size = batch * group * input_c * input_h * input_w; + float* input_buf = (float*)malloc(sizeof(float) * input_size); + for(int i=0; izero[0]) * param->scale[0]; + + /* dequant kernel */ + int kernel_total = group *output_c* kernel_size; + float* kernel_buf = (float*)malloc(sizeof(float) * kernel_total); + for(int i=0; izero[1]) * param->scale[1]; + + /* dequant biases */ + int bias_size = group *output_c; + + float* bias_buf = NULL; + if(bias != NULL) + { + bias_buf = (float*)malloc(sizeof(float) * bias_size); + for(int i=0; iscale[0] * param->scale[1]; + } + + int n,g,c,h,w,kc,kh,kw; + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + for ( n = 0; n < batch; ++n){ + for( g = 0; g < group; ++g){ + for ( c = 0; c < output_c; ++c) { + for ( h = 0; h < output_h; ++h){ + for ( w = 0; w < output_w; ++w){ + const int h_start = (h * param->strides[0]) - param->pads[0]; + const int w_start = (w * param->strides[1]) - param->pads[1]; + float total = 0.f; + if(param->layout == 0){ + output_offset = n*group*output_c*output_h*output_w + + g*output_c*output_h*output_w + + c* output_h*output_w + h*output_w + w; + } + else{ + output_offset = n*group*output_c*output_h*output_w + + h*output_w*group*output_c + w*group*output_c + + output_c*g + c; + } + for (kc = 0; kc < input_c; ++kc){ + for ( kh = 0; kh < param->kernels[0]; ++kh){ + for ( kw = 0; kw < param->kernels[1]; ++kw){ + const int cur_y = h_start + param->dilations[0] * kh; + const int cur_x = w_start + param->dilations[1] * kw; + // If the location is outside the bounds of the input image, + 
// use zero as a default value. + if ((cur_x >= 0) && (cur_x < input_w) && (cur_y >= 0) && (cur_y < input_h)) { + if(param->layout == 0){ + input_offset = n*group*input_c*input_h*input_w + + g*input_c*input_h*input_w + + kc* input_h*input_w + cur_y*input_w + cur_x; + kernel_offset = g*output_c*kernel_size + c*kernel_size + + kc*param->kernels[0]*param->kernels[1] + + kh* param->kernels[1] + kw; + } + else{ + input_offset = n*group*input_c*input_h*input_w + + cur_y*input_w*input_c*group + cur_x* input_c*group + + g*input_c + kc; + kernel_offset = c*kernel_size*group + kh* param->kernels[1]*input_c*group + + kw*input_c*group + g*input_c + kc; + } + total += (input_buf[input_offset] * kernel_buf[kernel_offset]); + } + } + } + } + float bias_value = 0.0f; + if (bias != NULL) { + bias_value = bias_buf[output_c* g + c]; + } + total = activation(total + bias_value,param->activation); + int out = round(total/param->scale[2]) + param->zero[2]; + if(out > 255) out = 255; + if(out < 0 ) out = 0; + output[output_offset] = out; + } + } + } + } + } + if( bias != NULL) + free(bias_buf); + free(kernel_buf); + free(input_buf); + return 0; +} + diff --git a/executor/operator/ref/kernel/deconvolution/ref_deconv_fp16.c b/executor/operator/ref/kernel/deconvolution/ref_deconv_fp16.c new file mode 100644 index 000000000..b59dbd1f7 --- /dev/null +++ b/executor/operator/ref/kernel/deconvolution/ref_deconv_fp16.c @@ -0,0 +1,185 @@ +static inline void activation_fp16(__fp16* input, int activation) +{ +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float tmp = fp16_to_fp32(*input); +#else + __fp16 tmp = *input; +#endif + if( activation >= 0) + { + if(tmp < 0) + tmp = 0; + if(activation== 1 && tmp>1) + tmp = 1; + if(activation== 2 && tmp>6) + tmp = 6; + } + +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + *input = fp32_to_fp16(tmp); +#else + *input = tmp; +#endif +} + + +static int ref_deconv_fp16(const __fp16* input, __fp16* output, const __fp16* kernel, const __fp16* bias, const deconv_ref_param* 
param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + int kernel_h = param->kernels[0]; + int kernel_w = param->kernels[1]; + int pad_h0 = param->pads[0]; + int pad_w0 = param->pads[1]; + int stride_h = param->strides[0]; + int stride_w = param->strides[1]; + int dilation_h = param->dilations[0]; + int dilation_w = param->dilations[1]; + + int n,g,c,h,w,kc,k_h,k_w; + int org_out_x = 0; + int org_out_y = 0; + int cur_out_x = 0; + int cur_out_y = 0; + + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + + memset((void*)output,0,output_h* output_w * output_c *batch* group * sizeof(__fp16)); + + for (n = 0; n < batch; ++n) + { + for( g = 0; g < group; ++g) + { + for(h = 0; h < input_h; h++) + { + for(w = 0;w < input_w; w++) + { + org_out_x = w * stride_w - pad_w0; + org_out_y = h * stride_h - pad_h0; + for(kc = 0; kc < input_c;kc++) + { + if(param->layout == 0) + { + input_offset = n * group * input_c * input_h * input_w + + g * input_c * input_h * input_w + + kc * input_h * input_w + + h * input_w + w; + } + else + { + input_offset = n * group * input_c * input_h * input_w + \ + h * group * input_c * input_w + \ + w * group * input_c + \ + g * input_c + kc; + } + for(c = 0; c < output_c; c++) + { + for(k_h = 0;k_h < kernel_h;k_h++) + { + for(k_w = 0;k_w < kernel_w; k_w++) + { + cur_out_x = org_out_x + k_w * dilation_w; + cur_out_y = org_out_y + k_h * dilation_h; + + if(cur_out_x >= 0 && cur_out_x < output_w && cur_out_y >=0 && cur_out_y < output_h) + { + if(param->layout == 0) + { + kernel_offset = g * output_c *input_c * kernel_h *kernel_w + \ + kc * output_c * kernel_h * kernel_w + \ + c * kernel_h * kernel_w + \ + k_h * kernel_w + k_w; + + output_offset = n * group * output_c * output_w * output_h +\ + g 
* output_c * output_w * output_h + \ + c * output_w * output_h +\ + cur_out_y * output_w + cur_out_x; + + } + else + { + kernel_offset = g * output_c * input_c * kernel_h * kernel_w +\ + k_h * kernel_w * output_c +\ + k_w * output_c + c; + output_offset = n * output_h * output_w * output_c * group +\ + cur_out_y * group * output_w * output_c + \ + cur_out_x * group * output_c + \ + g * output_c + c; + } +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float tmp = 0; + tmp = fp16_to_fp32(output[output_offset]); + tmp += (fp16_to_fp32(input[input_offset]) * fp16_to_fp32(kernel[kernel_offset])); + output[output_offset] = fp32_to_fp16(tmp); +#else + output[output_offset] += kernel[kernel_offset] * input[input_offset]; +#endif + } + } + } + } + } + } + } + } + } + if(NULL != bias) + { + for(n = 0; n < batch; n++) + { + for(g = 0; g < group; g++) + { + for(c = 0; c < output_c ;c++) + { + for(h = 0; h < output_h;h++) + { + for(w= 0;w < output_w;w++) + { + if(param->layout == 0) + { + output_offset = n * output_c * group * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_h * output_w + \ + h * output_w + w; + } + else + { + output_offset = n * output_c * group * output_w * output_h +\ + h * output_c * group * output_w + \ + w * output_c * group + c; + } +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float tmp = 0; + tmp = fp16_to_fp32(output[output_offset]); + tmp += (fp16_to_fp32(output[output_offset]) + fp16_to_fp32(bias[g*output_c+c]));; + output[output_offset] = fp32_to_fp16(tmp); +#else + output[output_offset] += bias[g*output_c +c]; +#endif + } + } + } + } + } + } + + //activation + if(param->activation >= 0) + { + for(n = 0; n < batch*group*output_c*output_w*output_h; n++) + { + activation_fp16(&output[n], param->activation); + } + } + + return 0; +} + diff --git a/executor/operator/ref/kernel/deconvolution/ref_deconv_fp32.c b/executor/operator/ref/kernel/deconvolution/ref_deconv_fp32.c new file mode 100644 index 000000000..613b367d7 --- 
/dev/null +++ b/executor/operator/ref/kernel/deconvolution/ref_deconv_fp32.c @@ -0,0 +1,127 @@ +static int ref_deconv_fp32(const float * input, float * output, const float* kernel, const float* bias, const deconv_ref_param* param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + int kernel_h = param->kernels[0]; + int kernel_w = param->kernels[1]; + int pad_h0 = param->pads[0]; + int pad_w0 = param->pads[1]; + int stride_h = param->strides[0]; + int stride_w = param->strides[1]; + int dilation_h = param->dilations[0]; + int dilation_w = param->dilations[1]; + + int n,g,c,h,w,kc,k_h,k_w; + int org_out_x = 0; + int org_out_y = 0; + int cur_out_x = 0; + int cur_out_y = 0; + + float input_val; + float weight_val; + float bias_val = 0; + + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + + memset((void*)output,0,output_h* output_w * output_c *batch* group*sizeof(float)); + + for (n = 0; n < batch; ++n){ + for( g = 0; g < group; ++g){ + for(h = 0; h < input_h; h++){ + for(w = 0;w < input_w; w++){ + org_out_x = w * stride_w - pad_w0; + org_out_y = h * stride_h - pad_h0; + for(kc = 0; kc < input_c;kc++){ + if(param->layout == 0){ + input_offset = n * group * input_c * input_h * input_w + \ + g * input_c * input_h * input_w + \ + kc * input_h * input_w + \ + h * input_w + w; + } + else{ + input_offset = n * group * input_c * input_h * input_w + \ + h * group * input_c * input_w + \ + w * group * input_c + \ + g * input_c + kc; + } + input_val = input[input_offset]; + for(c = 0; c < output_c; c++){ + for(k_h = 0;k_h < kernel_h;k_h++){ + for(k_w = 0;k_w < kernel_w; k_w++){ + cur_out_x = org_out_x + k_w * dilation_w; + cur_out_y = org_out_y + k_h * dilation_h; + if(cur_out_x >= 0 && cur_out_x < output_w + && 
cur_out_y >=0 && cur_out_y < output_h){ + if(param->layout == 0){ + kernel_offset = g * output_c *input_c * kernel_h *kernel_w + \ + kc * output_c * kernel_h * kernel_w + \ + c * kernel_h * kernel_w + \ + k_h * kernel_w + k_w; + + output_offset = n * group * output_c * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_w * output_h +\ + cur_out_y * output_w + cur_out_x; + } + else{ + kernel_offset = g * output_c * input_c * kernel_h * kernel_w +\ + k_h * kernel_w * output_c +\ + k_w * output_c + c; + output_offset = n * output_h * output_w * output_c * group +\ + cur_out_y * group * output_w * output_c + \ + cur_out_x * group * output_c + \ + g * output_c + c; + } + weight_val = kernel[kernel_offset]; + output[output_offset] += weight_val * input_val; + } + } + } + } + } + } + } + } + } + if(NULL != bias){ + for(n = 0; n < batch; n++){ + for(g = 0; g < group; g++){ + for(c = 0; c < output_c ;c++){ + bias_val = bias[g * output_c + c]; + for(h = 0; h < output_h;h++){ + for(w= 0;w < output_w;w++){ + if(param->layout == 0){ + output_offset = n * output_c * group * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_h * output_w + \ + h * output_w + w; + } + else{ + output_offset = n * output_c * group * output_w * output_h +\ + h * output_c * group * output_w + \ + w * output_c * group + c; + } + output[output_offset] += bias_val; + } + } + } + } + } + } + + //activation + for(n = 0; n < batch*group*output_c*output_w*output_h; n++) { + output[n] = activation(output[n], param->activation); + } + + return 0; +} + diff --git a/executor/operator/ref/kernel/deconvolution/ref_deconv_int8.c b/executor/operator/ref/kernel/deconvolution/ref_deconv_int8.c new file mode 100644 index 000000000..73b07b2ad --- /dev/null +++ b/executor/operator/ref/kernel/deconvolution/ref_deconv_int8.c @@ -0,0 +1,192 @@ +static int ref_deconv_int8(const int8_t* input, int8_t* output, const int8_t* kernel, const float* bias, deconv_ref_param* param) 
+{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + int kernel_h = param->kernels[0]; + int kernel_w = param->kernels[1]; + int pad_h0 = param->pads[0]; + int pad_w0 = param->pads[1]; + int stride_h = param->strides[0]; + int stride_w = param->strides[1]; + int dilation_h = param->dilations[0]; + int dilation_w = param->dilations[1]; + + int n,g,c,h,w,kc,k_h,k_w; + int org_out_x = 0; + int org_out_y = 0; + int cur_out_x = 0; + int cur_out_y = 0; + + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + + float input_scale = param->scale[0]; + float weight_scale = param->scale[1]; + float output_scale = 1/(input_scale* weight_scale); + int output_max = 0; + + int output_size = batch * output_c * group * output_h * output_w; + + float *output_tmp = (float*)malloc(output_size); + float input_val; + float weight_val; + if(NULL == output_tmp) + { + printf("Malloc output tmp memory failed!\n"); + return -1; + } + memset(output_tmp,0,output_size); + + for (n = 0; n < batch; ++n) + { + for( g = 0; g < group; ++g) + { + for(h = 0; h < input_h; h++) + { + for(w = 0;w < input_w; w++) + { + org_out_x = w * stride_w - pad_w0; + org_out_y = h * stride_h - pad_h0; + for(kc = 0; kc < input_c;kc++) + { + if(param->layout == 0) + { + input_offset = n * group * input_c * input_h * input_w + + g * input_c * input_h * input_w + + kc * input_h * input_w + + h * input_w + w; + } + else + { + input_offset = n * group * input_c * input_h * input_w + \ + h * group * input_c * input_w + \ + w * group * input_c + \ + g * input_c + kc; + } + for(c = 0; c < output_c; c++) + { + for(k_h = 0;k_h < kernel_h;k_h++) + { + for(k_w = 0;k_w < kernel_w; k_w++) + { + cur_out_x = org_out_x + k_w * dilation_w; + cur_out_y = org_out_y + k_h * dilation_h; 
+ + if(cur_out_x >= 0 && cur_out_x < output_w && cur_out_y >=0 && cur_out_y < output_h) + { + if(param->layout == 0) + { + kernel_offset = g * output_c *input_c * kernel_h *kernel_w + \ + kc * output_c * kernel_h * kernel_w + \ + c * kernel_h * kernel_w + \ + k_h * kernel_w + k_w; + + output_offset = n * group * output_c * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_w * output_h +\ + cur_out_y * output_w + cur_out_x; + + } + else + { + kernel_offset = g * output_c * input_c * kernel_h * kernel_w +\ + k_h * kernel_w * output_c +\ + k_w * output_c + c; + output_offset = n * output_h * output_w * output_c * group +\ + cur_out_y * group * output_w * output_c + \ + cur_out_x * group * output_c + \ + g * output_c + c; + } + input_val = input[input_offset] / input_scale; + weight_val = kernel[kernel_offset] / weight_scale; + output_tmp[output_offset] += input_val * weight_val ; + } + } + } + } + } + } + } + } + } + if(NULL != bias) + { + for(n = 0; n < batch; n++) + { + for(g = 0; g < group; g++) + { + for(c = 0; c < output_c ;c++) + { + for(h = 0; h < output_h;h++) + { + for(w= 0;w < output_w;w++) + { + if(param->layout == 0) + { + output_offset = n * output_c * group * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_h * output_w + \ + h * output_w + w; + } + else + { + output_offset = n * output_c * group * output_w * output_h +\ + h * output_c * group * output_w + \ + w * output_c * group + c; + } + output_tmp[output_offset] += bias[g*output_c +c]; + } + } + } + } + } + } + + + //activation + if(param->activation >= 0) + { + for(n = 0; n < batch*group*output_c*output_w*output_h; n++) + { + output_tmp[n] = activation(output_tmp[n], param->activation); + } + } + + + output_max = abs(output_tmp[0]); + + for(n = 1; n < output_size; n++) + { + if(fabs(output_tmp[n]) > output_max) + { + output_max = fabs(output_tmp[n]); + } + } + + output_scale = output_max / 127; + + // quant output + for(n = 0; n < 
batch*group*output_c*output_w*output_h; n++) + { + int output_data = round(output_tmp[n] / output_scale ); + if(output_data > 127) + output[n] = 127; + else if(output_data < -127) + output[n] = -127; + else + output[n] = (int8_t)output_data; + } + + param->scale[2] = output_scale; + free(output_tmp); + + return 0; +} + diff --git a/executor/operator/ref/kernel/deconvolution/ref_deconv_kernel.h b/executor/operator/ref/kernel/deconvolution/ref_deconv_kernel.h new file mode 100644 index 000000000..4a73fd203 --- /dev/null +++ b/executor/operator/ref/kernel/deconvolution/ref_deconv_kernel.h @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#ifndef __REF_DECONV_KERNEL_H__ +#define __REF_DECONV_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct deconv_ref_param +{ + int in_shape[4]; //NCHW + int out_shape[3]; //CHW + int kernels[2]; //hw + int strides[2]; //hw + int dilations[2]; //hw + int pads[2]; + int batch; + int group; + int activation; + int layout; + int zero[3]; //input, kernel, output + float scale[3]; //input, kernel, output +}; + +static inline float activation(float input, int activation) +{ + if( activation >= 0) + { + if(input < 0) + input = 0; + if(activation== 1 && input>1) + input = 1; + if(activation== 2 && input>6) + input = 6; + } + + return input; +} + +typedef int (*ref_deconv_kernel_t)(const void * input, void * output, const void* kernel, const void* bias, const deconv_ref_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_deconv_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_deconv_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_deconv_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_deconv_uint8.c" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/deconvolution/ref_deconv_uint8.c b/executor/operator/ref/kernel/deconvolution/ref_deconv_uint8.c new file mode 100644 index 000000000..cce178ecc --- /dev/null +++ b/executor/operator/ref/kernel/deconvolution/ref_deconv_uint8.c @@ -0,0 +1,182 @@ +static int ref_deconv_uint8(const uint8_t* input, uint8_t* output, const uint8_t* kernel, const int* bias, const deconv_ref_param* param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + int kernel_h = 
param->kernels[0]; + int kernel_w = param->kernels[1]; + int pad_h0 = param->pads[0]; + int pad_w0 = param->pads[1]; + int stride_h = param->strides[0]; + int stride_w = param->strides[1]; + int dilation_h = param->dilations[0]; + int dilation_w = param->dilations[1]; + + int n,g,c,h,w,kc,k_h,k_w; + int org_out_x = 0; + int org_out_y = 0; + int cur_out_x = 0; + int cur_out_y = 0; + + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + + float input_scale = param->scale[0]; + float weight_scale = param->scale[1]; + float output_scale = param->scale[2]; + + float input_val = 0; + float weight_val = 0; + float bias_val = 0; + uint8_t input_zero = param->zero[0]; + uint8_t weight_zero = param->zero[1]; + uint8_t output_zero = param->zero[2]; + + float output_size = batch * output_c * group * output_h * output_w; + float *output_tmp = (float*)malloc(output_size); + if(NULL == output_tmp) + { + printf("Malloc output tmp memory failed!\n"); + return -1; + } + memset(output_tmp,0,output_size); + + for (n = 0; n < batch; ++n) + { + for( g = 0; g < group; ++g) + { + for(h = 0; h < input_h; h++) + { + for(w = 0;w < input_w; w++) + { + org_out_x = w * stride_w - pad_w0; + org_out_y = h * stride_h - pad_h0; + for(kc = 0; kc < input_c;kc++) + { + if(param->layout == 0) + { + input_offset = n * group * input_c * input_h * input_w + + g * input_c * input_h * input_w + + kc * input_h * input_w + + h * input_w + w; + } + else + { + input_offset = n * group * input_c * input_h * input_w + \ + h * group * input_c * input_w + \ + w * group * input_c + \ + g * input_c + kc; + } + for(c = 0; c < output_c; c++) + { + for(k_h = 0;k_h < kernel_h;k_h++) + { + for(k_w = 0;k_w < kernel_w; k_w++) + { + cur_out_x = org_out_x + k_w * dilation_w; + cur_out_y = org_out_y + k_h * dilation_h; + + if(cur_out_x >= 0 && cur_out_x < output_w && cur_out_y >=0 && cur_out_y < output_h) + { + if(param->layout == 0) + { + kernel_offset = g * output_c *input_c * kernel_h *kernel_w + \ + kc * 
output_c * kernel_h * kernel_w + \ + c * kernel_h * kernel_w + \ + k_h * kernel_w + k_w; + + output_offset = n * group * output_c * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_w * output_h +\ + cur_out_y * output_w + cur_out_x; + + } + else + { + kernel_offset = g * output_c * input_c * kernel_h * kernel_w +\ + k_h * kernel_w * output_c +\ + k_w * output_c + c; + output_offset = n * output_h * output_w * output_c * group +\ + cur_out_y * group * output_w * output_c + \ + cur_out_x * group * output_c + \ + g * output_c + c; + } + + input_val = input_scale*(input[input_offset] - input_zero); + weight_val = weight_scale * (kernel[kernel_offset] - weight_zero); + output_tmp[output_offset] += input_val * weight_val; + } + } + } + } + } + } + } + } + } + if(NULL != bias) + { + for(n = 0; n < batch; n++) + { + for(g = 0; g < group; g++) + { + for(c = 0; c < output_c ;c++) + { + for(h = 0; h < output_h;h++) + { + for(w= 0;w < output_w;w++) + { + if(param->layout == 0) + { + output_offset = n * output_c * group * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_h * output_w + \ + h * output_w + w; + } + else + { + output_offset = n * output_c * group * output_w * output_h +\ + h * output_c * group * output_w + \ + w * output_c * group + c; + } + bias_val = bias[g*output_c +c] * input_scale * weight_scale; + output_tmp[output_offset] += bias_val; + } + } + } + } + } + } + + //activation + if(param->activation >= 0) + { + for(n = 0; n < batch*group*output_c*output_w*output_h; n++) + { + output_tmp[n] = activation(output_tmp[n], param->activation); + } + } + + //quant the output + for(n = 0; n < batch*group*output_c*output_w*output_h; n++) + { + int output_data = round( output_tmp[n] / output_scale + output_zero); + if(output_data > 255) + output[n] = 255; + else if(output_data < 0) + output[n] = 0; + else + output[n] = (uint8_t)output_data; + + } + + free(output_tmp); + return 0; +} + diff --git 
a/executor/operator/ref/kernel/dpp/ref_dpp_fp16.c b/executor/operator/ref/kernel/dpp/ref_dpp_fp16.c new file mode 100644 index 000000000..5422439fc --- /dev/null +++ b/executor/operator/ref/kernel/dpp/ref_dpp_fp16.c @@ -0,0 +1,36 @@ + +int ref_dpp_fp16(const __fp16* input, const __fp16* score, const __fp16* anchor, + float* detect_num, float* detect_class, float* detect_score, float* detect_boxes, dpp_param* param) +{ + const int num_classes = param->num_classes + 1; + const int num_boxes = param->num_boxes; + + /* transform __fp16 to fp32 */ + int input_size = num_boxes * 4; + int score_size = num_boxes * num_classes; + float* input_f = (float*)malloc( input_size * sizeof(float)); + float* score_f = (float*)malloc( score_size * sizeof(float)); + float* anchor_f = (float*)malloc( input_size * sizeof(float)); + +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + for(int i =0; i < input_size; i++) + input_f[i] = fp16_to_fp32(input[i]); + for(int i =0; i < input_size; i++) + score_f[i] = fp16_to_fp32(score[i]); + for(int i =0; i < input_size; i++) + anchor_f[i] = fp16_to_fp32(anchor[i]); +#else + for(int i =0; i < input_size; i++) + input_f[i] = input[i]; + for(int i =0; i < input_size; i++) + score_f[i] = score[i]; + for(int i =0; i < input_size; i++) + anchor_f[i] = anchor[i]; +#endif + + ref_dpp_common(input_f, score_f, anchor_f, param, detect_num, detect_class, detect_score, detect_boxes); + free(anchor_f); + free(score_f); + free(input_f); + return 0; +} diff --git a/executor/operator/ref/kernel/dpp/ref_dpp_fp32.c b/executor/operator/ref/kernel/dpp/ref_dpp_fp32.c new file mode 100644 index 000000000..dcf9a6cee --- /dev/null +++ b/executor/operator/ref/kernel/dpp/ref_dpp_fp32.c @@ -0,0 +1,6 @@ + +int ref_dpp_fp32(const float* input, const float* score, const float* anchor, + float* detect_num, float* detect_class, float* detect_score, float* detect_boxes, dpp_param* param) +{ + return ref_dpp_common(input, score, anchor, param, detect_num, detect_class, detect_score, 
detect_boxes);; +} diff --git a/executor/operator/ref/kernel/dpp/ref_dpp_kernel.h b/executor/operator/ref/kernel/dpp/ref_dpp_kernel.h new file mode 100644 index 000000000..fe34a09b6 --- /dev/null +++ b/executor/operator/ref/kernel/dpp/ref_dpp_kernel.h @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#ifndef __REF_DPP_KERNEL_H__ +#define __REF_DPP_KERNEL_H__ + +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct Dpp_Box +{ + float x0; // xmin + float y0; // ymin + float x1; // xmax + float y1; // ymax + int box_idx; + int class_idx; + float score; +}; + +struct dpp_param +{ + int max_detections; + int max_classes_per_detection; + float nms_score_threshold; + float nms_iou_threshold; + int num_classes; + int num_boxes; + float scales[4]; + float quant_scale[3]; + int zero[3]; +}; + +#define DPP_MIN(a,b) ( ab ? 
a : b ) + +typedef int (*ref_dpp_kernel_t )(const void* input, const void* score, const void* anchor, + void* detect_num, void* detect_class, void* detect_score, void* detect_boxes, dpp_param* param); + +static inline float intersection_area(const struct Dpp_Box a, const struct Dpp_Box b) +{ + if(a.x0 > b.x1 || a.x1 < b.x0 || a.y0 > b.y1 || a.y1 < b.y0) + { + // no intersection + return 0.f; + } + + float inter_width = DPP_MIN(a.x1, b.x1) - DPP_MAX(a.x0, b.x0); + float inter_height = DPP_MIN(a.y1, b.y1) - DPP_MAX(a.y0, b.y0); + + return inter_width * inter_height; +} + +static inline void nms_sorted_bboxes(const struct Dpp_Box* boxes, int boxes_size, int* picked, int* picked_size, float nms_threshold) +{ + float areas[boxes_size]; + int n_picked = 0; + for(int i = 0; i < boxes_size; i++) + { + + float width = boxes[i].x1 - boxes[i].x0; + float height = boxes[i].y1 - boxes[i].y0; + + areas[i] = width * height; + } + + for(int i = 0; i < boxes_size; i++) + { + int keep = 1; + for(int j = 0; j < n_picked; j++) + { + + // intersection over union + float inter_area = intersection_area(boxes[i], boxes[picked[j]]); + float union_area = areas[i] + areas[picked[j]] - inter_area; + // float IoU = inter_area / union_area + if(inter_area / union_area > nms_threshold) + keep = 0; + } + + if(keep) + { + picked[n_picked] = i; + n_picked ++; + } + } + *picked_size = n_picked; +} + +void sort_boxes_by_score(struct Dpp_Box* boxes, int size) +{ + int i, j; + for(i = 0; i < size-1; i++) + { + int max_idx = i; + for(j = i + 1; j < size; j++) + { + if(boxes[j].score < 0.6) + continue; + if(boxes[max_idx].score < boxes[j].score) + max_idx = j; + } + if(i != max_idx) + { + struct Dpp_Box tmp; + memcpy(&tmp, boxes+i, sizeof(struct Dpp_Box)); + memcpy(boxes + i, boxes+max_idx, sizeof(struct Dpp_Box)); + memcpy(boxes + max_idx, &tmp, sizeof(struct Dpp_Box)); + } + else + { + if(boxes[max_idx].score < 0.6) + return ; + } + } +} + +static inline int decode_single_box(struct Dpp_Box* box, const 
float* box_ptr, const float* anchor_ptr, + const float* scales) +{ + int i = box->box_idx; + + const float* box_coord = box_ptr + i * 4; + const float* anchor = anchor_ptr + i * 4; + + // [0]: y [1]: x [2]: h [3]: w + float ycenter = box_coord[0] / scales[0] * anchor[2] + anchor[0]; + float xcenter = box_coord[1] / scales[1] * anchor[3] + anchor[1]; + float half_h = 0.5f * (exp(box_coord[2] / scales[2])) * anchor[2]; + float half_w = 0.5f * (exp(box_coord[3] / scales[3])) * anchor[3]; + + box->y0 = ycenter - half_h; + box->x0 = xcenter - half_w; + box->y1 = ycenter + half_h; + box->x1 = xcenter + half_w; + if(box->y0 < 0 || box->x0 < 0) + return -1; + return 0; +} + +void get_all_boxes_rect(struct Dpp_Box* all_class_bbox_rects, + const float* box, const float* scores, const float* anchor, + int num_boxes, int num_classes, float* scales) +{ + struct Dpp_Box selected_box; + for(int j = 0; j < num_boxes; j++) + { + for(int i = 1; i < num_classes; i++) + { + float score = scores[j * num_classes + i]; + + if(score < 0.6) + continue; + + selected_box.score = score; + selected_box.class_idx = i; + selected_box.box_idx = j; + //printf("score: %f ,box_idx: %d ,class: %d\n",score, j, i); + + if(decode_single_box(&selected_box, box, anchor, scales) < 0) + continue; + + //struct Box* cls_vector = all_class_bbox_rects[i]; + memcpy(all_class_bbox_rects + i*num_boxes +j, &selected_box, sizeof(struct Dpp_Box)); + + } + } +} + +int ref_dpp_common(const float* input_f, const float* score_f, const float* anchor_f, dpp_param* param, + float* detect_num, float* detect_class, float* detect_score, float* detect_boxes) +{ + + const int num_classes = param->num_classes + 1; + const int num_boxes = param->num_boxes; + const int max_detections = param->max_detections; + + struct Dpp_Box* all_boxes = (struct Dpp_Box*)malloc(num_classes*num_boxes*sizeof(struct Dpp_Box)); + memset(all_boxes, 0, sizeof(struct Dpp_Box)*num_classes*num_boxes); + + get_all_boxes_rect(all_boxes, input_f, score_f, 
anchor_f, + num_boxes, num_classes, param->scales); + + int max_picked_boxes = 2 * max_detections * num_classes; + struct Dpp_Box* picked_boxes = (struct Dpp_Box*)malloc(max_picked_boxes * sizeof(struct Dpp_Box)); + memset(picked_boxes, 0, sizeof(struct Dpp_Box)*max_picked_boxes); + int all_picked_size = 0; + + for(int i = 1; i < num_classes; i++) + { + struct Dpp_Box* class_box = all_boxes + i*num_boxes; + + // sort + sort_boxes_by_score(class_box, num_boxes); + int box_size = 0; + for(int j = 0; j < num_boxes; j ++) + { + if(class_box[j].score < 0.6) + break; + box_size ++; + } + if(box_size == 0) + continue; + + + + if( box_size > max_detections * 2) + box_size = max_detections * 2; + + int picked[num_boxes]; + int picked_size = 0; + + picked[0]=0; + nms_sorted_bboxes(class_box, box_size, picked, &picked_size, param->nms_iou_threshold); + + // save the survivors + for(int j = 0; j < picked_size; j++) + { + int z = picked[j]; + memcpy(picked_boxes + all_picked_size, class_box + z,sizeof(struct Dpp_Box)); + all_picked_size++; + } + + } + + sort_boxes_by_score(picked_boxes, max_picked_boxes); + if(all_picked_size > max_detections) + all_picked_size = max_detections; + + printf("all_picked_size: %d\n",all_picked_size); + // generate output tensors + detect_num[0] = all_picked_size; + + for(int i = 0; i < all_picked_size; i++) + { + + detect_class[i] = picked_boxes[i].class_idx; + detect_score[i] = picked_boxes[i].score; + + detect_boxes[4 * i] = picked_boxes[i].x0; + detect_boxes[4 * i + 1] = picked_boxes[i].y0; + detect_boxes[4 * i + 2] = picked_boxes[i].x1; + detect_boxes[4 * i + 3] = picked_boxes[i].y1; + } + + free(all_boxes); + free(picked_boxes); + + return 0; +} + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_dpp_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_dpp_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_dpp_uint8.c" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/dpp/ref_dpp_uint8.c 
b/executor/operator/ref/kernel/dpp/ref_dpp_uint8.c new file mode 100644 index 000000000..f3d362ea2 --- /dev/null +++ b/executor/operator/ref/kernel/dpp/ref_dpp_uint8.c @@ -0,0 +1,28 @@ + +int ref_dpp_uint8(const uint8_t* input, const uint8_t* score, const uint8_t* anchor, + float* detect_num, float* detect_class, float* detect_score, float* detect_boxes, dpp_param* param) +{ + const int num_classes = param->num_classes + 1; + const int num_boxes = param->num_boxes; + + /* transform uint8_t to fp32 */ + int input_size = num_boxes * 4; + int score_size = num_boxes * num_classes; + float* input_f = (float*)malloc( input_size * sizeof(float)); + float* score_f = (float*)malloc( score_size * sizeof(float)); + float* anchor_f = (float*)malloc( input_size * sizeof(float)); + for(int i =0; i < input_size; i++) + input_f[i] = (input[i] - param->zero[0]) * param->quant_scale[0]; + for(int i =0; i < score_size; i++) + score_f[i] = score[i] * param->quant_scale[1]; + for(int i =0; i < input_size; i++) + anchor_f[i] = (anchor[i] - param->zero[2]) * param->quant_scale[2]; + + ref_dpp_common(input_f, score_f, anchor_f, param, detect_num, detect_class, detect_score, detect_boxes); + + free(anchor_f); + free(score_f); + free(input_f); + + return 0; +} diff --git a/executor/operator/ref/kernel/eltwise/eltwise.h b/executor/operator/ref/kernel/eltwise/eltwise.h new file mode 100644 index 000000000..05f58b3da --- /dev/null +++ b/executor/operator/ref/kernel/eltwise/eltwise.h @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __ELTWISE_KERNEL_H__ +#define __ELTWISE_KERNEL_H__ + +#include +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct eltwise_param; + +struct eltwise_param +{ + float scale[3]; + int zero[3]; +}; + +typedef int (*eltwise_t)(void* output, void* input0, void* input1, int type, int input_count4, + int input_chan,int input_chan_1,int input_hw,int input_hw_1, int input1_count4, + int input_h,int input_w,int input_h_1,int input_w_1,int input_n,int input_n_1,int layout, + int out_size,float* output_buf,eltwise_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "eltwise_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "eltwise_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "eltwise_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "eltwise_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/eltwise/eltwise_fp16.c b/executor/operator/ref/kernel/eltwise/eltwise_fp16.c new file mode 100644 index 000000000..4f51a3508 --- /dev/null +++ b/executor/operator/ref/kernel/eltwise/eltwise_fp16.c @@ -0,0 +1,1010 @@ +static int eltwise_fp16(__fp16* output, __fp16* input0, __fp16* input1,int type, int input_count4, + int input_chan,int input_chan_1,int input_hw, int input_hw_1,int input1_count4, + int input_h,int input_w,int input_h_1,int input_w_1,int input_n, + int input_n_1,int layout,int out_size,float * 
output_buf,eltwise_param* param) +{ +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + switch(type) + { + case 10: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + *output++ = fp32_to_fp16(fp16_to_fp32((*input0++)) / fp16_to_fp32(input1[0])); + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + *output++ = fp32_to_fp16(fp16_to_fp32(input0[i]) / fp16_to_fp32(input1[i])); + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + *output++ = fp32_to_fp16(fp16_to_fp32(input0[0])/fp16_to_fp32((*input1++))); + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[0]) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[i] =result; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]) * param->scale[0]; + float real_input1 = (input1[i]) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[i] = result; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]) * param->scale[0]; + float real_input1 = ((*input1++)) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[i] = result; + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[k]) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[ofset] = result; + } + } + } + } + //nhwc + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[c]) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[ofst]=result; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[ofset]) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[ofset] = result; + } + } 
+ } + } + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[ofst]) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[ofst]=result; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 0: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) ) * param->scale[0]; + float real_input1 = (input1[0]) * param->scale[1]; + float result=real_input0*real_input1; + printf("result: %f\n",result); + output_buf[i] =result; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]) * param->scale[0]; + float real_input1 = (input1[i]) * param->scale[1]; + float result=real_input0*real_input1; + output_buf[i] = result; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]) * param->scale[0]; + float real_input1 = ((*input1++)) * param->scale[1]; + float result=real_input0*real_input1; + output_buf[i] = result; + } + } + else if(input_chan == input1_count4) + { + + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[k]) * param->scale[1]; + float result=real_input0*real_input1; + output_buf[ofset] = result; + } + } + } + } + //nhwc + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[c]) * param->scale[1]; + float result=real_input0*real_input1; + output_buf[ofst]=result; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[ofset]) * param->scale[1]; + float result=real_input0*real_input1; + output_buf[ofset] = result; + } + } + } + } + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[ofst]) * param->scale[1]; + float result=real_input0*real_input1; + output_buf[ofst]=result; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 4: + { + if(input1_count4 == 1) + { + for(int i = 
0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) ) * param->scale[0]; + float real_input1 = (input1[0]) * param->scale[1]; + float result=real_input0-real_input1; + output_buf[i] =result; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + + float real_input0 = (input0[i]) * param->scale[0]; + float real_input1 = (input1[i]) * param->scale[1]; + float result=real_input0-real_input1; + *output_buf = result; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]) * param->scale[0]; + float real_input1 = ((*input1++)) * param->scale[1]; + float result=real_input0-real_input1; + output_buf[i] = result; + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[k]) * param->scale[1]; + float result=real_input0-real_input1; + output_buf[ofset] = result; + } + } + } + } + //nhwc + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[c]) * param->scale[1]; + float result=real_input0-real_input1; + output_buf[ofst]=result; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[ofset]) * param->scale[1]; + float result=real_input0-real_input1; + output_buf[ofset] = result; + } + } + } + } + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[ofst]) * param->scale[1]; + float result=real_input0-real_input1; + output_buf[ofst]=result; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 2: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) ) * param->scale[0]; + float real_input1 = (input1[0]) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[i] =result; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = 
(input0[i]) * param->scale[0]; + float real_input1 = (input1[i]) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[i] = result; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]) * param->scale[0]; + float real_input1 = ((*input1++)) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[i] = result; + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[k]) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[ofset] = result; + } + } + } + } + //nhwc + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[c]) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[ofst]=result; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[ofset]) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[ofset] = result; + } + } + } + } + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[ofst]) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[ofst]=result; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 12: + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]) * param->scale[0]; + float result=exp(real_input0); + output_buf[i] = result; + } + break; + } + default: + break; + } + float output_max = 0.0f; + for(int i =0; i< out_size; i++) + { + if(output_max < fabs(output_buf[i])) + output_max = fabs(output_buf[i]); + } + param->scale[2] = output_max/127; + for(int i =0; i< out_size; i++) + { + output[i] = round(output_buf[i]*127/output_max); + } + + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/eltwise/eltwise_uint8.c b/executor/operator/ref/kernel/eltwise/eltwise_uint8.c new file mode 100644 index 000000000..f1e78fc8f --- 
/dev/null +++ b/executor/operator/ref/kernel/eltwise/eltwise_uint8.c @@ -0,0 +1,512 @@ +static int eltwise_uint8(uint8_t* output, uint8_t* input0, uint8_t* input1,int type, int input_count4, + int input_chan,int input_chan_1,int input_hw, int input_hw_1,int input1_count4, + int input_h,int input_w,int input_h_1,int input_w_1,int input_n, + int input_n_1,int layout,int out_size,float * output_buf,eltwise_param* param) +{ + switch(type) + { + case 10: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) - param->zero[0]) * param->scale[0]; + float real_input1 = (input1[0]- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + *output++ =round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]- param->zero[0]) * param->scale[0]; + float real_input1 = (input1[i]- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]- param->zero[0]) * param->scale[0]; + float real_input1 = ((*input1++)- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[k]- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + //nhwc + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[c]- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } 
+ } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[ofset]- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[ofst]- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 0: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) - param->zero[0]) * param->scale[0]; + float real_input1 = (input1[0]- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + *output++ =round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]- param->zero[0]) * param->scale[0]; + float real_input1 = (input1[i]- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]- param->zero[0]) * param->scale[0]; + float real_input1 = ((*input1++)- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_chan == input1_count4) + { + + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[k]- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + //nhwc + else + { + for(int i=0;izero[0]) * param->scale[0]; + float 
real_input1 = (input1[c]- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[ofset]- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[ofst]- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 4: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) - param->zero[0]) * param->scale[0]; + float real_input1 = (input1[0]- param->zero[1]) * param->scale[1]; + float result=real_input0-real_input1; + *output++ =round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + + float real_input0 = (input0[i]) * param->scale[0]; + float real_input1 = (input1[i]) * param->scale[1]; + float result=real_input0-real_input1; + *output = round(result / param->scale[2]); + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]- param->zero[0]) * param->scale[0]; + float real_input1 = ((*input1++)- param->zero[1]) * param->scale[1]; + float result=real_input0-real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[k]- param->zero[1]) * param->scale[1]; + float result=real_input0-real_input1; + output[ofset] = round(result / 
param->scale[2]) + param->zero[2]; + } + } + } + } + //nhwc + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[c]- param->zero[1]) * param->scale[1]; + float result=real_input0-real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[ofset]- param->zero[1]) * param->scale[1]; + float result=real_input0-real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[ofst]- param->zero[1]) * param->scale[1]; + float result=real_input0-real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 2: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) - param->zero[0]) * param->scale[0]; + float real_input1 = (input1[0]- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + *output++ =round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]- param->zero[0]) * param->scale[0]; + float real_input1 = (input1[i]- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]- param->zero[0]) * param->scale[0]; + float real_input1 = ((*input1++)- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int 
j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[k]- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + //nhwc + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[c]- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[ofset]- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[ofst]- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 12: + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]- param->zero[0]) * param->scale[0]; + float result=exp(real_input0); + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + break; + } + default: + break; + } + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/fully_connected/ref_fc_fp16.c b/executor/operator/ref/kernel/fully_connected/ref_fc_fp16.c new file mode 100644 index 000000000..2f80cf269 --- /dev/null +++ b/executor/operator/ref/kernel/fully_connected/ref_fc_fp16.c @@ -0,0 +1,40 @@ + +static int ref_fc_fp16(const __fp16 * input, __fp16 * output, const __fp16* weight, const __fp16* bias, fc_data* param) +{ + int batch = param->batch; + int hidden = param->hidden; + int out_number = param->out_number; + + int n,i,j; + for ( n = 0; n < batch; ++n) + { + for( i = 0; i < out_number; ++i) + { +#if!defined( 
__ARM_ARCH) || __ARM_ARCH <8 + float tmp = bias ? fp16_to_fp32(bias[i]) : 0.0; + for ( j = 0; j < hidden; ++j) + { + if(param->need_trans == 0) + tmp += fp16_to_fp32(input[n*hidden + j]) * fp16_to_fp32(weight[i*hidden + j]); + else + tmp += fp16_to_fp32(input[n*hidden + j]) * fp16_to_fp32(weight[i + j*out_number]); + } + + output[n*out_number + i ] = fp32_to_fp16(tmp); +#else + __fp16 tmp = bias ? bias[i] : 0.0; + for ( j = 0; j < hidden; ++j) + { + if(param->need_trans == 0) + tmp += input[n*hidden + j] * weight[i*hidden + j]; + else + tmp += input[n*hidden + j] * weight[i + j*out_number]; + } + + output[n*out_number + i ] = tmp; +#endif + } + } + return 0; +} + diff --git a/executor/operator/ref/kernel/fully_connected/ref_fc_fp32.c b/executor/operator/ref/kernel/fully_connected/ref_fc_fp32.c new file mode 100644 index 000000000..55caca665 --- /dev/null +++ b/executor/operator/ref/kernel/fully_connected/ref_fc_fp32.c @@ -0,0 +1,28 @@ + + +static int ref_fc_fp32(const float * input, float * output, const float* weight, const float* bias, fc_data* param) +{ + int batch = param->batch; + int hidden = param->hidden; + int out_number = param->out_number; + + + int n,i,j; + for ( n = 0; n < batch; ++n) + { + for( i = 0; i < out_number; ++i) + { + float tmp = bias ? 
bias[i]:0.0; + for ( j = 0; j < hidden; ++j) + { + if(param->need_trans == 0) + tmp += input[n* hidden + j] * weight[i*hidden + j]; + else + tmp += input[n* hidden + j] * weight[i + j*out_number]; + } + output[n*out_number + i ] = tmp; + } + } + return 0; +} + diff --git a/executor/operator/ref/kernel/fully_connected/ref_fc_int8.c b/executor/operator/ref/kernel/fully_connected/ref_fc_int8.c new file mode 100644 index 000000000..4c1ad953c --- /dev/null +++ b/executor/operator/ref/kernel/fully_connected/ref_fc_int8.c @@ -0,0 +1,62 @@ + +static int ref_fc_int8(const int8_t * input, int8_t * output, const int8_t* weight, const float* bias, fc_data* param) +{ + int batch = param->batch; + int hidden = param->hidden; + int out_number = param->out_number; + + /* dequant input */ + int input_size = batch * hidden; + float* input_buf = (float*)malloc(sizeof(float) * input_size); + for(int i=0; iscale[0]; + } + + /* dequant kernel */ + int kernel_size = hidden * out_number; + float* weight_buf = (float*)malloc(sizeof(float) * kernel_size); + for(int i=0; iscale[1]; + } + + /* malloc output_buffer */ + int output_size = batch * out_number; + float* output_buf = (float*)malloc(sizeof(float) * output_size); + + int n,i,j; + for ( n = 0; n < batch; ++n) + { + for( i = 0; i < out_number; ++i) + { + float tmp = bias ? 
bias[i] :0.0; + for ( j = 0; j < hidden; ++j) + { + if(param->need_trans == 0) + tmp += input_buf[n* hidden + j] * weight_buf[i*hidden + j]; + else + tmp += input_buf[n* hidden + j] * weight_buf[i + j*out_number]; + } + output_buf[n*out_number + i ] = tmp; + } + } + + /* quant output */ + float output_max = 0.0f; + for(int i =0; i< output_size; i++) + { + if(output_max < fabs(output_buf[i])) + output_max = fabs(output_buf[i]); + } + param->scale[2] = output_max/127; + for(int i =0; i< output_size; i++) + { + output[i] = round(output_buf[i]*127/output_max); + } + free(output_buf); + free(weight_buf); + free(input_buf); + return 0; +} + diff --git a/executor/operator/ref/kernel/fully_connected/ref_fc_kernel.h b/executor/operator/ref/kernel/fully_connected/ref_fc_kernel.h new file mode 100644 index 000000000..fc55309e5 --- /dev/null +++ b/executor/operator/ref/kernel/fully_connected/ref_fc_kernel.h @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#ifndef __REF_FC_KERNEL_H__ +#define __REF_FC_KERNEL_H__ + +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct fc_data +{ + int need_trans; + int batch; //N + int out_number; //OUT + int hidden; //hidden + int zero[3]; //input, kernel, output + float scale[3]; //input, kernel, output +}; + + +typedef int (*ref_fc_kernel_t)(const void * input, void * output, const void* weight, const void* bias, fc_data* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_fc_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_fc_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_fc_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_fc_uint8.c" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/fully_connected/ref_fc_uint8.c b/executor/operator/ref/kernel/fully_connected/ref_fc_uint8.c new file mode 100644 index 000000000..b73d85ef0 --- /dev/null +++ b/executor/operator/ref/kernel/fully_connected/ref_fc_uint8.c @@ -0,0 +1,48 @@ + +static int ref_fc_uint8(const uint8_t * input, uint8_t * output, const uint8_t* weight, const int* bias, fc_data* param) +{ + int batch = param->batch; + int hidden = param->hidden; + int out_number = param->out_number; + + /* dequant input */ + int input_size = batch * hidden; + float* input_buf = (float*)malloc(sizeof(float) * input_size); + for(int i=0; izero[0]) * param->scale[0]; + } + + /* dequant kernel */ + int kernel_size = hidden * out_number; + float* weight_buf = (float*)malloc(sizeof(float) * kernel_size); + for(int i=0; izero[1]) * param->scale[1]; + } + + int n,i,j; + for ( n = 0; n < batch; ++n) + { + for( i = 0; i < out_number; ++i) + { + float tmp = bias? 
bias[i]*param->scale[0]*param->scale[1]:0.0; + for ( j = 0; j < hidden; ++j) + { + if(param->need_trans == 0) + tmp += input_buf[n* hidden + j] * weight_buf[i*hidden + j]; + else + tmp += input_buf[n* hidden + j] * weight_buf[i + j*out_number]; + } + int quant_tmp = round(tmp/param->scale[2]) + param->zero[2]; + if(quant_tmp > 255) quant_tmp = 255; + if(quant_tmp < 0) quant_tmp = 0; + output[n*out_number + i ] = quant_tmp; + } + } + + free(weight_buf); + free(input_buf); + return 0; +} + diff --git a/executor/operator/ref/kernel/lrn/ref_lrn_fp16.c b/executor/operator/ref/kernel/lrn/ref_lrn_fp16.c new file mode 100644 index 000000000..37ebbcbe0 --- /dev/null +++ b/executor/operator/ref/kernel/lrn/ref_lrn_fp16.c @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ + +static int ref_lrn_fp16(const __fp16* in_data, __fp16* out_data, ref_lrn_param* param) +{ + int n = param->dims[0]; + int c = param->dims[1]; + int h = param->dims[2]; + int w = param->dims[3]; + + float alpha = param->alpha; + float beta = param->beta; + float bias = param->bias; + int local_size = param->local_size; + + int channel_size = h * w; + int img_size = c * channel_size; + + float* square = ( float* )(malloc(img_size * sizeof(float))); + float* accum_square = ( float* )(malloc(channel_size * sizeof(float))); + + for(int i = 0; i < n; i++) + { + const __fp16* img_base = in_data + i * img_size; + + /* get square value */ + for(int j = 0; j < img_size; j++) + { + float img_data = fp16_to_fp32(img_base[j]); + square[j] = img_data * img_data + bias; + } + + if(param->norm_region == 0) /* LRN_ACROSS_CHANNELS */ + { + float alpha_over_size = alpha / local_size; + + for(int j = 0; j < c; j++) + { + int c_start = j - local_size / 2; + int c_end = j + local_size / 2; + + memset(accum_square, 0x0, channel_size * sizeof(float)); + + for(int l = c_start; l <= c_end; l++) + { + if(l < 0 || l >= c) + continue; + + for(int n = 0; n < channel_size; n++) + { + accum_square[n] += square[l * channel_size + n]; + } + } + + /* get the output */ + for(int n = 0; n < channel_size; n++) + { + int offset = i * img_size + j * channel_size + n; + float input_f = fp16_to_fp32(in_data[offset]); + float output_f = input_f * pow(1.0f + alpha_over_size * accum_square[n], -beta); + out_data[offset] = fp32_to_fp16(output_f); + } + } + } + else + { + printf("LRN: IN CHANNEL, TO BE IMPLEMENTED\n"); + } + } + + free(square); + free(accum_square); + return 0; +} diff --git a/executor/operator/ref/kernel/lrn/ref_lrn_fp32.c b/executor/operator/ref/kernel/lrn/ref_lrn_fp32.c new file mode 100644 index 000000000..eb843c5cb --- /dev/null +++ b/executor/operator/ref/kernel/lrn/ref_lrn_fp32.c @@ -0,0 +1,90 @@ +/* 
+ * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ + +static int ref_lrn_fp32(const float* in_data, float* out_data, ref_lrn_param* param) +{ + int n = param->dims[0]; + int c = param->dims[1]; + int h = param->dims[2]; + int w = param->dims[3]; + + float alpha = param->alpha; + float beta = param->beta; + float bias = param->bias; + int local_size = param->local_size; + + int channel_size = h * w; + int img_size = c * channel_size; + + float* square = ( float* )(malloc(img_size * sizeof(float))); + float* accum_square = ( float* )(malloc(channel_size * sizeof(float))); + + for(int i = 0; i < n; i++) + { + const float* img_base = in_data + i * img_size; + + /* get square value */ + for(int j = 0; j < img_size; j++) + square[j] = img_base[j] * img_base[j] + bias; + + if(param->norm_region == 0) /* LRN_ACROSS_CHANNELS */ + { + float alpha_over_size = alpha / local_size; + + for(int j = 0; j < c; j++) + { + int c_start = j - local_size / 2; + int c_end = j + local_size / 2; + + memset(accum_square, 0x0, channel_size * sizeof(float)); + + for(int l = c_start; l <= c_end; l++) + { + if(l < 0 || l >= c) + continue; + + for(int n = 0; n < 
channel_size; n++) + { + accum_square[n] += square[l * channel_size + n]; + } + } + + /* get the output */ + for(int n = 0; n < channel_size; n++) + { + int offset = i * img_size + j * channel_size + n; + out_data[offset] = in_data[offset] * pow(1.0f + alpha_over_size * accum_square[n], -beta); + } + } + } + else + { + printf("LRN: IN CHANNEL, TO BE IMPLEMENTED\n"); + } + } + + free(square); + free(accum_square); + return 0; +} diff --git a/executor/operator/ref/kernel/lrn/ref_lrn_int8.c b/executor/operator/ref/kernel/lrn/ref_lrn_int8.c new file mode 100644 index 000000000..ec7bfb5f6 --- /dev/null +++ b/executor/operator/ref/kernel/lrn/ref_lrn_int8.c @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ + +static int ref_lrn_int8(const int8_t* in_data, int8_t* out_data, ref_lrn_param* param) +{ + int n = param->dims[0]; + int c = param->dims[1]; + int h = param->dims[2]; + int w = param->dims[3]; + + float alpha = param->alpha; + float beta = param->beta; + float bias = param->bias; + int local_size = param->local_size; + + int channel_size = h * w; + int img_size = c * channel_size; + int input_size = n * img_size; + + float* square = ( float* )(malloc(img_size * sizeof(float))); + float* accum_square = ( float* )(malloc(channel_size * sizeof(float))); + float* input_f = ( float* )(malloc(input_size * sizeof(float))); + float* output_f = ( float* )(malloc(input_size * sizeof(float))); + + for(int i = 0; i < input_size; i++) + input_f[i] = in_data[i] * param->scale[0]; + + for(int i = 0; i < n; i++) + { + const float* img_base = input_f + i * img_size; + + /* get square value */ + for(int j = 0; j < img_size; j++) + square[j] = img_base[j] * img_base[j] + bias; + + if(param->norm_region == 0) /* LRN_ACROSS_CHANNELS */ + { + float alpha_over_size = alpha / local_size; + + for(int j = 0; j < c; j++) + { + int c_start = j - local_size / 2; + int c_end = j + local_size / 2; + + memset(accum_square, 0x0, channel_size * sizeof(float)); + + for(int l = c_start; l <= c_end; l++) + { + if(l < 0 || l >= c) + continue; + + for(int n = 0; n < channel_size; n++) + { + accum_square[n] += square[l * channel_size + n]; + } + } + + /* get the output */ + for(int n = 0; n < channel_size; n++) + { + int offset = i * img_size + j * channel_size + n; + output_f[offset] = input_f[offset] * pow(1.0f + alpha_over_size * accum_square[n], -beta); + } + } + } + else + { + printf("LRN: IN CHANNEL, TO BE IMPLEMENTED\n"); + } + } + + float max_val = 0.0f; + for(int i = 0; i < input_size; i++) + { + if(max_val < fabs(output_f[i])) + max_val = fabs(output_f[i]); + } + float out_scale = max_val / 127; + for(int 
i = 0; i < input_size; i++) + { + out_data[i] = (int8_t)(round(output_f[i] / out_scale)); + } + param->scale[1] = out_scale; + + free(square); + free(accum_square); + free(input_f); + free(output_f); + return 0; +} diff --git a/executor/operator/ref/kernel/lrn/ref_lrn_kernel.h b/executor/operator/ref/kernel/lrn/ref_lrn_kernel.h new file mode 100644 index 000000000..9f32135ce --- /dev/null +++ b/executor/operator/ref/kernel/lrn/ref_lrn_kernel.h @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ + +#ifndef __REF_LRN_KERNEL_H__ +#define __REF_LRN_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ref_lrn_param +{ + float alpha; + float beta; + float bias; + int local_size; + int norm_region; + int layout; + int dims[4]; + int zero[2]; /* input, output */ + float scale[2]; /* input, output */ +}; + +typedef int (*ref_lrn_kernel_t)(const void* in_data, void* out_data, ref_lrn_param* param); + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_lrn_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_lrn_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_lrn_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_lrn_uint8.c" +#endif + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/executor/operator/ref/kernel/lrn/ref_lrn_uint8.c b/executor/operator/ref/kernel/lrn/ref_lrn_uint8.c new file mode 100644 index 000000000..41252fd2a --- /dev/null +++ b/executor/operator/ref/kernel/lrn/ref_lrn_uint8.c @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ + +static int ref_lrn_uint8(const uint8_t* in_data, uint8_t* out_data, ref_lrn_param* param) +{ + int n = param->dims[0]; + int c = param->dims[1]; + int h = param->dims[2]; + int w = param->dims[3]; + + float alpha = param->alpha; + float beta = param->beta; + float bias = param->bias; + int local_size = param->local_size; + + int channel_size = h * w; + int img_size = c * channel_size; + + float* square = ( float* )(malloc(img_size * sizeof(float))); + float* accum_square = ( float* )(malloc(channel_size * sizeof(float))); + + for(int i = 0; i < n; i++) + { + const uint8_t* img_base = in_data + i * img_size; + + /* get square value */ + for(int j = 0; j < img_size; j++) + { + float img_data = (img_base[j] - param->zero[0]) * param->scale[0]; + square[j] = img_data * img_data + bias; + } + + if(param->norm_region == 0) /* LRN_ACROSS_CHANNELS */ + { + float alpha_over_size = alpha / local_size; + + for(int j = 0; j < c; j++) + { + int c_start = j - local_size / 2; + int c_end = j + local_size / 2; + + memset(accum_square, 0x0, channel_size * sizeof(float)); + + for(int l = c_start; l <= c_end; l++) + { + if(l < 0 || l >= c) + continue; + + for(int n = 0; n < channel_size; n++) + { + accum_square[n] += square[l * channel_size + n]; + } + } + + /* get the output */ + for(int n = 0; n < channel_size; n++) + { + int offset = i * img_size + j * channel_size + n; + float output_f = in_data[offset] * pow(1.0f + alpha_over_size * accum_square[n], -beta); + out_data[offset] = (uint8_t)(round(output_f / param->scale[1]) + param->zero[1]); + } + } + } + else + { + printf("LRN: IN CHANNEL, TO BE IMPLEMENTED\n"); + } + } + + free(square); + free(accum_square); + return 0; +} diff --git a/executor/operator/ref/kernel/pad/pad_fp16.c b/executor/operator/ref/kernel/pad/pad_fp16.c new file mode 100644 index 000000000..0e22417d0 --- /dev/null +++ b/executor/operator/ref/kernel/pad/pad_fp16.c @@ -0,0 
+1,133 @@ +static int pad_fp16(__fp16 * data,__fp16 * out_data,pad_param * param) +{ + + if (param->mode==0) + { + //support pad on h,w dim only + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + if (h < 0 || w < 0 || h >= param->in_h || w >= param->in_w) + { + for (int c = 0; c < param->in_c; ++c) + { + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + out_data[pad_index + c] = fp32_to_fp16(fp16_to_fp32(param->cv_f16)); + #else + out_data[pad_index + c] = param->cv_f16; + #endif + } + } + else + { + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else if (param->mode==1) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + h = MAX(h, -h); + h = MIN(h, 2 * param->in_h - h - 2); + w = MAX(w, -w); + w = MIN(w, 2 * param->in_w - w - 2); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. 
+ data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + } + else if(param->mode==2) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int w = pw - param->pad_2_h; + int h = ph - param->pad_1_h; + h = MAX(h, -h-1); + h = MIN(h, 2 * param->in_h - h - 1); + w = MAX(w, -w-1); + w = MIN(w, 2 * param->in_w - w - 1); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + + out_data[pad_index + c] = data[input_index + c]; + + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else + { + return -1; + } + return 0; + + +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/pad/pad_fp32.c b/executor/operator/ref/kernel/pad/pad_fp32.c new file mode 100644 index 000000000..80873fdc8 --- /dev/null +++ b/executor/operator/ref/kernel/pad/pad_fp32.c @@ -0,0 +1,129 @@ +static int pad_fp32(float * data,float * out_data,pad_param * param) +{ + if (param->mode==0) + { + //support pad on h,w dim only + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + if (h < 0 || w < 0 || h >= param->in_h || w >= param->in_w) + { + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = param->cv_f32; + } + } + else + { + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 
0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else if (param->mode==1) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + h = MAX(h, -h); + h = MIN(h, 2 * param->in_h - h - 2); + w = MAX(w, -w); + w = MIN(w, 2 * param->in_w - w - 2); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + } + else if(param->mode==2) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + // int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + int h = ph - param->pad_1_h; + h = MAX(h, -h-1); + h = MIN(h, 2 * param->in_h - h - 1); + w = MAX(w, -w-1); + w = MIN(w, 2 * param->in_w - w - 1); + + const int input_index = (h * param->in_w + w) * param->in_c ; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. 
+ data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else + { + return -1; + } + + + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/pad/pad_int8.c b/executor/operator/ref/kernel/pad/pad_int8.c new file mode 100644 index 000000000..f04b5a3d2 --- /dev/null +++ b/executor/operator/ref/kernel/pad/pad_int8.c @@ -0,0 +1,126 @@ +static int pad_int8(int8_t * data,int8_t * out_data,pad_param * param) +{ + if (param->mode==0) + { + //support pad on h,w dim only + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + if (h < 0 || w < 0 || h >= param->in_h || w >= param->in_w) + { + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = param->cv_int8; + } + } + else + { + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + } + // Do offset. 
+ data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else if (param->mode==1) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + h = MAX(h, -h); + h = MIN(h, 2 * param->in_h - h - 2); + w = MAX(w, -w); + w = MIN(w, 2 * param->in_w - w - 2); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + } + else if(param->mode==2) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int w = pw - param->pad_2_h; + int h = ph - param->pad_1_h; + h = MAX(h, -h-1); + h = MIN(h, 2 * param->in_h - h - 1); + w = MAX(w, -w-1); + w = MIN(w, 2 * param->in_w - w - 1); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. 
+ data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else + { + return -1; + } + + + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/pad/pad_kernel.h b/executor/operator/ref/kernel/pad/pad_kernel.h new file mode 100644 index 000000000..4e7b5aca3 --- /dev/null +++ b/executor/operator/ref/kernel/pad/pad_kernel.h @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __PAD_KERNEL_H__ +#define __PAD_KERNEL_H__ + +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct pad_param; + +struct pad_param +{ + int mode; + float cv_f32; + __fp16 cv_f16; + int8_t cv_int8; + uint8_t cv_uint8; + int in_size; + int out_size; + int in_n; + int in_h; + int in_w; + int in_c; + int out_h; + int out_w; + int out_n; + int pad_0_h; + int pad_0_w; + int pad_1_h; + int pad_1_w; + int pad_2_h; + int pad_2_w; + int pad_3_h; + int pad_3_w; + float scale[2]; + int zero[2]; +}; + +typedef int (*pad_t)(void * data,void * out_data,pad_param* param); + +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) + +#ifdef CONFIG_KERNEL_FP32 +#include "pad_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "pad_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "pad_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "pad_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/pad/pad_uint8.c b/executor/operator/ref/kernel/pad/pad_uint8.c new file mode 100644 index 000000000..f8053af58 --- /dev/null +++ b/executor/operator/ref/kernel/pad/pad_uint8.c @@ -0,0 +1,126 @@ +static int pad_uint8(uint8_t * data,uint8_t * out_data,pad_param * param) +{ + if (param->mode==0) + { + //support pad on h,w dim only + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + if (h < 0 || w < 0 || h >= param->in_h || w >= param->in_w) + { + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = param->cv_uint8; + } + } + 
else + { + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else if (param->mode==1) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + h = MAX(h, -h); + h = MIN(h, 2 * param->in_h - h - 2); + w = MAX(w, -w); + w = MIN(w, 2 * param->in_w - w - 2); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + } + else if(param->mode==2) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int w = pw - param->pad_2_h; + int h = ph - param->pad_1_h; + h = MAX(h, -h-1); + h = MIN(h, 2 * param->in_h - h - 1); + w = MAX(w, -w-1); + w = MIN(w, 2 * param->in_w - w - 1); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. 
+ data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else + { + return -1; + } + + + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/permute/permute_fp16.c b/executor/operator/ref/kernel/permute/permute_fp16.c new file mode 100644 index 000000000..25008a2f8 --- /dev/null +++ b/executor/operator/ref/kernel/permute/permute_fp16.c @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + +static void __hwc_fp16(const __fp16* input,__fp16* output,int hh,int ww,int cc,int wc,int hw) +{ + for(int h=0; hlayout == TENGINE_LAYOUT_NCHW) + { + n = param->in_dim[0]; + c = param->in_dim[1]; + h = param->in_dim[2]; + w = param->in_dim[3]; + } + else + { + n = param->in_dim[0]; + h = param->in_dim[1]; + w = param->in_dim[2]; + c = param->in_dim[3]; + } + + int wc = w * c; + int hw = h * w; + int chw = c * hw; + + const __fp16* input = in_data; + __fp16* output = out_data; + if( param->order0 == 0 && param->order1 == 2 && param->order2 == 3 && param->order3 == 1 ) + { + for(int ii=0; iiorder0 == 0 && param->order1 == 3 && param->order2 == 1 && param->order3 == 2 ) + { + for(int ii=0; iilayout == TENGINE_LAYOUT_NCHW) + { + n = param->in_dim[0]; + c = param->in_dim[1]; + h = param->in_dim[2]; + w = param->in_dim[3]; + } + else + { + n = param->in_dim[0]; + h = param->in_dim[1]; + w = param->in_dim[2]; + c = param->in_dim[3]; + } + + int wc = w * c; + int hw = h * w; + int chw = c * hw; + + const float* input = in_data; + float* output = out_data; + if( param->order0 == 0 && param->order1 == 2 && param->order2 == 3 && param->order3 == 1 ) + { + for(int ii=0; iiorder0 == 0 && param->order1 == 3 && param->order2 == 1 && param->order3 == 2 ) + { + for(int ii=0; iilayout == TENGINE_LAYOUT_NCHW) + { + n = param->in_dim[0]; + c = param->in_dim[1]; + h = param->in_dim[2]; + w = param->in_dim[3]; + } + else + { + n = param->in_dim[0]; + h = param->in_dim[1]; + w = param->in_dim[2]; + c = param->in_dim[3]; + } + + int wc = w * c; + int hw = h * w; + int chw = c * hw; + + const int8_t* input = in_data; + int8_t* output = out_data; + if( param->order0 == 0 && param->order1 == 2 && param->order2 == 3 && param->order3 == 1 ) + { + for(int ii=0; iiorder0 == 0 && param->order1 == 3 && param->order2 == 1 && param->order3 == 2 ) + { + for(int ii=0; ii +#include + +#include "compiler_fp16.h" + 
+#ifdef __cplusplus +extern "C" { +#endif + +struct permute_param +{ + int order0; + int order1; + int order2; + int order3; + + int in_dim[4]; + int layout; +}; + +typedef int (*permute_t)(const void* in_data,void* out_data,const permute_param* param) ; + +#ifdef CONFIG_KERNEL_FP32 +#include "permute_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "permute_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "permute_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "permute_uint8.c" +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/permute/permute_uint8.c b/executor/operator/ref/kernel/permute/permute_uint8.c new file mode 100644 index 000000000..b706a4878 --- /dev/null +++ b/executor/operator/ref/kernel/permute/permute_uint8.c @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + +static void __hwc_uint8(const uint8_t* input,uint8_t* output,int hh,int ww,int cc,int wc,int hw) +{ + for(int h=0; hlayout == TENGINE_LAYOUT_NCHW) + { + n = param->in_dim[0]; + c = param->in_dim[1]; + h = param->in_dim[2]; + w = param->in_dim[3]; + } + else + { + n = param->in_dim[0]; + h = param->in_dim[1]; + w = param->in_dim[2]; + c = param->in_dim[3]; + } + + int wc = w * c; + int hw = h * w; + int chw = c * hw; + + const uint8_t* input = in_data; + uint8_t* output = out_data; + if( param->order0 == 0 && param->order1 == 2 && param->order2 == 3 && param->order3 == 1 ) + { + for(int ii=0; iiorder0 == 0 && param->order1 == 3 && param->order2 == 1 && param->order3 == 2 ) + { + for(int ii=0; iitmp ? max_f : tmp; + + } + *max = fp32_to_fp16(max_f); +#else + *max = 0.0f; + __fp16 tmp = 0.0f; + if(layout == 0) + *max = input[cur_ch*h*w + start_h*w + start_w]; + else + *max = input[start_h*w*c + start_w*c + cur_ch]; + + for(int i=start_h;itmp ? 
*max : tmp; + + } + +#endif +} + +static int ref_pooling_fp16(const __fp16 * input, __fp16 * output, struct op_data* param) +{ + int input_chw = param->channel * param->input[0]*param->input[1]; + int output_chw = param->channel * param->output[0]*param->output[1]; + + for(int n = 0; n < param->batch; n++) + { + const __fp16* input_cur = input + n*input_chw; + for(int c = 0; c < param->channel; c++) + { + for(int ph = 0; ph < param->output[0]; ph++) + { + for(int pw = 0; pw < param->output[1]; pw++) + { + int pool_size = 1; + int offset = 0; + int h_start = ph * param->strides[0] - param->pads[0]; + int h_end = h_start + param->kernels[0]; + if( h_end > param->input[0] + param->pads[0]) + h_end = param->input[0] + param->pads[0]; + int w_start = pw * param->strides[1] - param->pads[1]; + int w_end = w_start + param->kernels[1]; + if( w_end > param->input[1] + param->pads[1]) + w_end = param->input[1] + param->pads[1]; + + if(param->caffe_flavor) + pool_size = (h_end - h_start) * (w_end - w_start); + + h_start = h_start > 0 ? h_start : 0; + w_start = w_start > 0 ? w_start : 0; + h_end = h_end < param->input[0] ? h_end : param->input[0]; + w_end = w_end < param->input[1] ? 
w_end : param->input[1]; + //printf("w: %d,%d ,h: %d,%d\n",w_start,w_end,h_start,h_end); + + if(!param->caffe_flavor) + pool_size = (h_end - h_start) * (w_end - w_start); + if(param->layout == 0) //nchw + offset = n*output_chw + c*param->output[0]*param->output[1] + + ph*param->output[1] + pw; + else + offset =n*output_chw + ph*param->output[1]*param->channel + + pw*param->channel + c; + + if(param->method == 0) + { + __fp16 max; + calc_max_fp16(input_cur, &max, param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + output[offset] = max; + } + else if( param->method == 1) + { + __fp16 sum; + calc_sum_fp16(input_cur, &sum, param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + output[offset] = fp32_to_fp16( fp16_to_fp32(sum) / pool_size ); +#else + output[offset] = sum/pool_size; +#endif + } + else + return -1; + } + } + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/pooling/ref_pooling_fp32.c b/executor/operator/ref/kernel/pooling/ref_pooling_fp32.c new file mode 100644 index 000000000..14b3de214 --- /dev/null +++ b/executor/operator/ref/kernel/pooling/ref_pooling_fp32.c @@ -0,0 +1,104 @@ + +static inline float calc_sum(const float* input, int layout, int c, int h, int w, int cur_ch, int start_h, int start_w, int end_h, int end_w) +{ + float sum = 0.0f; + for(int i=start_h;itmp ? 
max : tmp; + + } + + return max; +} + +static int ref_pooling_fp32(const float * input, float * output, struct op_data* param) +{ + int input_chw = param->channel * param->input[0]*param->input[1]; + int output_chw = param->channel * param->output[0]*param->output[1]; + + for(int n = 0; n < param->batch; n++) + { + const float* input_cur = input + n*input_chw; + for(int c = 0; c < param->channel; c++) + { + for(int ph = 0; ph < param->output[0]; ph++) + { + for(int pw = 0; pw < param->output[1]; pw++) + { + int pool_size = 1; + int offset = 0; + int h_start = ph * param->strides[0] - param->pads[0]; + int h_end = h_start + param->kernels[0]; + if( h_end > param->input[0] + param->pads[0]) + h_end = param->input[0] + param->pads[0]; + int w_start = pw * param->strides[1] - param->pads[1]; + int w_end = w_start + param->kernels[1]; + if( w_end > param->input[1] + param->pads[1]) + w_end = param->input[1] + param->pads[1]; + + if(param->caffe_flavor) + pool_size = (h_end - h_start) * (w_end - w_start); + + h_start = h_start > 0 ? h_start : 0; + w_start = w_start > 0 ? w_start : 0; + h_end = h_end < param->input[0] ? h_end : param->input[0]; + w_end = w_end < param->input[1] ? 
w_end : param->input[1]; + //printf("w: %d,%d ,h: %d,%d\n",w_start,w_end,h_start,h_end); + + if(!param->caffe_flavor) + pool_size = (h_end - h_start) * (w_end - w_start); + if(param->layout == 0) //nchw + offset = n*output_chw + c*param->output[0]*param->output[1] + + ph*param->output[1] + pw; + else + offset =n*output_chw + ph*param->output[1]*param->channel + + pw*param->channel + c; + + if(param->method == 0) + { + float max = calc_max(input_cur,param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + output[offset] = max; + } + else if( param->method == 1) + { + float sum = calc_sum(input_cur,param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + output[offset] = sum/pool_size; + } + else + return -1; + } + } + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/pooling/ref_pooling_int8.c b/executor/operator/ref/kernel/pooling/ref_pooling_int8.c new file mode 100644 index 000000000..d5071096a --- /dev/null +++ b/executor/operator/ref/kernel/pooling/ref_pooling_int8.c @@ -0,0 +1,104 @@ + +static inline int calc_sum_int8(const int8_t* input, int layout, int c, int h, int w, int cur_ch, int start_h, int start_w, int end_h, int end_w) +{ + int sum = 0; + for(int i=start_h;itmp ? 
max : tmp; + + } + + return max; +} + +static int ref_pooling_int8(const int8_t* input, int8_t * output, struct op_data* param) +{ + int input_chw = param->channel * param->input[0]*param->input[1]; + int output_chw = param->channel * param->output[0]*param->output[1]; + + for(int n = 0; n < param->batch; n++) + { + const int8_t* input_cur = input + n*input_chw; + for(int c = 0; c < param->channel; c++) + { + for(int ph = 0; ph < param->output[0]; ph++) + { + for(int pw = 0; pw < param->output[1]; pw++) + { + int pool_size = 1; + int offset = 0; + int h_start = ph * param->strides[0] - param->pads[0]; + int h_end = h_start + param->kernels[0]; + if( h_end > param->input[0] + param->pads[0]) + h_end = param->input[0] + param->pads[0]; + int w_start = pw * param->strides[1] - param->pads[1]; + int w_end = w_start + param->kernels[1]; + if( w_end > param->input[1] + param->pads[1]) + w_end = param->input[1] + param->pads[1]; + + if(param->caffe_flavor) + pool_size = (h_end - h_start) * (w_end - w_start); + + h_start = h_start > 0 ? h_start : 0; + w_start = w_start > 0 ? w_start : 0; + h_end = h_end < param->input[0] ? h_end : param->input[0]; + w_end = w_end < param->input[1] ? 
w_end : param->input[1]; + //printf("w: %d,%d ,h: %d,%d\n",w_start,w_end,h_start,h_end); + + if(!param->caffe_flavor) + pool_size = (h_end - h_start) * (w_end - w_start); + if(param->layout == 0) //nchw + offset = n*output_chw + c*param->output[0]*param->output[1] + + ph*param->output[1] + pw; + else + offset =n*output_chw + ph*param->output[1]*param->channel + + pw*param->channel + c; + + if(param->method == 0) + { + int8_t max = calc_max_int8(input_cur,param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + output[offset] = max; + } + else if( param->method == 1) + { + int sum = calc_sum_int8(input_cur,param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + output[offset] = (int8_t)round(sum/pool_size); + } + else + return -1; + } + } + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/pooling/ref_pooling_kernel.h b/executor/operator/ref/kernel/pooling/ref_pooling_kernel.h new file mode 100644 index 000000000..fc7e28c46 --- /dev/null +++ b/executor/operator/ref/kernel/pooling/ref_pooling_kernel.h @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#ifndef __REF_POOLING_KERNEL_H__ +#define __REF_POOLING_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif +struct op_data{ + int layout; + int batch; + int channel; + int method; + int input[2]; + int output[2]; + int kernels[2]; + int strides[2]; + int pads[2]; + int caffe_flavor; + int zero_point; + int align[4]; +}; + + +typedef int (*ref_pooling_kernel_t)(const void * input, void * output, struct op_data* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_pooling_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_pooling_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_pooling_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_pooling_uint8.c" +#endif + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/pooling/ref_pooling_uint8.c b/executor/operator/ref/kernel/pooling/ref_pooling_uint8.c new file mode 100644 index 000000000..e3ec7a78b --- /dev/null +++ b/executor/operator/ref/kernel/pooling/ref_pooling_uint8.c @@ -0,0 +1,113 @@ + +static inline int calc_sum_uint8(const uint8_t* input, int layout, int c, int h, int w, int cur_ch, int start_h, int start_w, int end_h, int end_w) +{ + int sum = 0; + for(int i=start_h;itmp ? 
max : tmp; + + } + + return max; +} + +static int ref_pooling_uint8(const uint8_t* input, uint8_t * output, struct op_data* param) +{ + int input_chw = param->channel * param->input[0]*param->input[1]; + int output_chw = param->channel * param->output[0]*param->output[1]; + + int zero_point = param->zero_point; + + for(int n = 0; n < param->batch; n++) + { + const uint8_t* input_cur = input + n*input_chw; + for(int c = 0; c < param->channel; c++) + { + for(int ph = 0; ph < param->output[0]; ph++) + { + for(int pw = 0; pw < param->output[1]; pw++) + { + int pool_size = 1; + int pool_size_caffe = 1; + int offset = 0; + int h_start = ph * param->strides[0] - param->pads[0]; + int h_end = h_start + param->kernels[0]; + if( h_end > param->input[0] + param->pads[0]) + h_end = param->input[0] + param->pads[0]; + int w_start = pw * param->strides[1] - param->pads[1]; + int w_end = w_start + param->kernels[1]; + if( w_end > param->input[1] + param->pads[1]) + w_end = param->input[1] + param->pads[1]; + + if(param->caffe_flavor) + pool_size_caffe = (h_end - h_start) * (w_end - w_start); + + h_start = h_start > 0 ? h_start : 0; + w_start = w_start > 0 ? w_start : 0; + h_end = h_end < param->input[0] ? h_end : param->input[0]; + w_end = w_end < param->input[1] ? 
w_end : param->input[1]; + //printf("w: %d,%d ,h: %d,%d\n",w_start,w_end,h_start,h_end); + + pool_size = (h_end - h_start) * (w_end - w_start); + if(!param->caffe_flavor) + pool_size_caffe = (h_end - h_start) * (w_end - w_start); + + if(param->layout == 0) //nchw + offset = n*output_chw + c*param->output[0]*param->output[1] + + ph*param->output[1] + pw; + else + offset =n*output_chw + ph*param->output[1]*param->channel + + pw*param->channel + c; + + if(param->method == 0) + { + uint8_t max = calc_max_uint8(input_cur,param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + output[offset] = max; + } + else if( param->method == 1) + { + int sum = calc_sum_uint8(input_cur,param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + // (a-z)*s + ... + (n-z)*s = (output-z)*s*pool_size_caffe + // (a+...+z)-pool_size*z = output*pool_size_caffe - z* pool_size_caffe + // output = ( sum + (pool_size_caffe - pool_size)*z )/pool_size_caffe + int diff_size = pool_size_caffe - pool_size; + output[offset] = (uint8_t)round((sum + diff_size*zero_point)/pool_size_caffe); + } + else + return -1; + } + } + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/prelu/prelu_fp16.c b/executor/operator/ref/kernel/prelu/prelu_fp16.c new file mode 100644 index 000000000..49e9a6c5d --- /dev/null +++ b/executor/operator/ref/kernel/prelu/prelu_fp16.c @@ -0,0 +1,38 @@ +static int prelu_fp16(__fp16 * data,__fp16 * out_data, int dim0,int dim1,int dim2,int dim3,float* slope,prelu_param* param) +{ + int offset=0; + //__fp16* data = ( __fp16* )data; + for(int i = 0; i < dim0; i++) + { + for(int c = 0; c < dim1; c++) + { + for(int l = 0; l < dim2; l++) + { + for(int k = 0; k < dim3; k++) + { + if(param->layout==0) + { + //nchw + offset = i*dim1*dim2*dim3 + c*dim2*dim3 + l*dim3 + k; + + } + else + { + //nhwc + offset = i*dim1*dim2*dim3 + l*dim3*dim1 + k*dim1 + c; + } +/* for arm32 && x86 */ +#if!defined( __ARM_ARCH) 
|| __ARM_ARCH <8 + float output_real = MAX(fp16_to_fp32(data[offset]), 0) + slope[c] * MIN(fp16_to_fp32(data[offset]), 0.f); + out_data[offset] = fp32_to_fp16(output_real); + +#else + out_data[offset] = MAX(data[offset], 0) + slope[c] * MIN(data[offset], 0.f); +#endif + } + } + + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/prelu/prelu_fp32.c b/executor/operator/ref/kernel/prelu/prelu_fp32.c new file mode 100644 index 000000000..1654ae92e --- /dev/null +++ b/executor/operator/ref/kernel/prelu/prelu_fp32.c @@ -0,0 +1,33 @@ + +static int prelu_fp32(float * data,float * out_data, int dim0,int dim1,int dim2,int dim3,float* slope,const prelu_param * param) +{ + int offset=0; + //nchw + //nhwc + for(int i = 0; i < dim0; i++) + { + for(int c = 0; c < dim1; c++) + { + for(int l = 0; l < dim2; l++) + { + for(int k = 0; k < dim3; k++) + { + if(param->layout==0) + { + //nchw + offset = i*dim1*dim2*dim3 + c*dim2*dim3 + l*dim3 + k; + + } + else + { + //nhwc + offset = i*dim1*dim2*dim3 + l*dim3*dim1 + k*dim1 + c; + } + out_data[offset] = MAX(data[offset], 0) + slope[c] * MIN(data[offset], 0.f); + } + } + + } + } + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/prelu/prelu_int8.c b/executor/operator/ref/kernel/prelu/prelu_int8.c new file mode 100644 index 000000000..2b209b88f --- /dev/null +++ b/executor/operator/ref/kernel/prelu/prelu_int8.c @@ -0,0 +1,34 @@ + +static int prelu_int8(int8_t * data,int8_t * out_data, int dim0,int dim1,int dim2,int dim3,float* slope,const prelu_param * param) +{ + int offset; + + for(int i = 0; i < dim0; i++) + { + for(int c = 0; c < dim1; c++) + { + for(int l = 0; l < dim2; l++) + { + for(int k = 0; k < dim3; k++) + { + if(param->layout==0) + { + //nchw + offset=i*dim1*dim2*dim3 + c*dim2*dim3 + l*dim3 + k; + + } + else + { + //nhwc + offset=i*dim1*dim2*dim3 + l*dim3*dim1 + k*dim1 + c; + } + float real_input = data[offset] * param->scale; + float real_output = MAX(real_input, 0) + slope[c] * 
MIN(real_input, 0.f); + out_data[offset] = round(real_output / param->scale); + } + } + + } + } + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/prelu/prelu_kernel.h b/executor/operator/ref/kernel/prelu/prelu_kernel.h new file mode 100644 index 000000000..f66eb8f63 --- /dev/null +++ b/executor/operator/ref/kernel/prelu/prelu_kernel.h @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __PRELU_KERNEL_H__ +#define __PRELU_KERNEL_H__ + +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct prelu_param; + +struct prelu_param +{ + int layout; + float scale; + int zero; +}; + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +typedef int (*prelu_t)(void * data,void * out_data, int dim0,int dim1,int dim2,int dim3,float* slope,const prelu_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "prelu_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "prelu_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "prelu_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "prelu_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/prelu/prelu_uint8.c b/executor/operator/ref/kernel/prelu/prelu_uint8.c new file mode 100644 index 000000000..c6541187a --- /dev/null +++ b/executor/operator/ref/kernel/prelu/prelu_uint8.c @@ -0,0 +1,34 @@ + +static int prelu_uint8(uint8_t * data,uint8_t * out_data, int dim0,int dim1,int dim2,int dim3,float* slope,const prelu_param * param) +{ + int offset; + + for(int i = 0; i < dim0; i++) + { + for(int c = 0; c < dim1; c++) + { + for(int l = 0; l < dim2; l++) + { + for(int k = 0; k < dim3; k++) + { + if(param->layout==0) + { + //nchw + offset=i*dim1*dim2*dim3 + c*dim2*dim3 + l*dim3 + k; + + } + else + { + //nhwc + offset=i*dim1*dim2*dim3 + l*dim3*dim1 + k*dim1 + c; + } + float real_input = (data[offset] - param->zero) * param->scale; + float real_output = MAX(real_input, 0) + slope[c] * MIN(real_input, 0.f); + out_data[offset] = round(real_output / param->scale) + param->zero; + } + } + + } + } + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/reduction/reduce.h b/executor/operator/ref/kernel/reduction/reduce.h new file mode 100644 index 000000000..1902e39c3 --- /dev/null +++ b/executor/operator/ref/kernel/reduction/reduce.h @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __REDUCE_KERNEL_H__ +#define __REDUCE_KERNEL_H__ + +#include +#include +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct reduce_param; + +struct reduce_param +{ + int layout; + int type; + int param_dim[4]; + float scale[2]; + int zero[2]; +}; + +typedef int (*reduce_t)(void * data,void * out_data, int dim0,int dim1,int dim2,int dim3,int out_size,reduce_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "reduce_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "reduce_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "reduce_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "reduce_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/reduction/reduce_fp16.c b/executor/operator/ref/kernel/reduction/reduce_fp16.c new file mode 100644 index 000000000..96e1ebc1a --- /dev/null +++ b/executor/operator/ref/kernel/reduction/reduce_fp16.c @@ -0,0 +1,1314 @@ +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 +void sum_4d_ax0_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void sum_4d_ax1_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void sum_4d_ax2_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * 
tmp); +void sum_4d_ax3_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void sum_3d_ax0_fp16(int dim1,int dim2,int dim3,float * tmp,float* tmp_01); +void sum_3d_ax1_fp16(int dim1,int dim2,int dim3,float * tmp,float* tmp_02); +void sum_3d_ax2_fp16(int dim1,int dim2,int dim3,float * tmp,float* tmp_03); +void sum_2d_ax0_fp16(int dim1,int dim2,float * tmp,float* tmp_0); +void sum_2d_ax1_fp16(int dim1,int dim2,float * tmp,float* tmp_1); + +void mean_4d_ax0_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void mean_4d_ax1_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void mean_4d_ax2_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void mean_4d_ax3_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void mean_3d_ax0_fp16(int dim1,int dim2,int dim3,float * tmp,float* tmp_01); +void mean_3d_ax1_fp16(int dim1,int dim2,int dim3,float * tmp,float* tmp_02); +void mean_3d_ax2_fp16(int dim1,int dim2,int dim3,float * tmp,float* tmp_03); +void mean_2d_ax0_fp16(int dim1,int dim2,float * tmp,float* tmp_0); +void mean_2d_ax1_fp16(int dim1,int dim2,float * tmp,float* tmp_1); +#else +void sum_4d_ax0_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void sum_4d_ax1_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void sum_4d_ax2_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void sum_4d_ax3_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void sum_3d_ax0_f16(int dim1,int dim2,int dim3,__fp16 * tmp,__fp16* tmp_01); +void sum_3d_ax1_f16(int dim1,int dim2,int dim3,__fp16 * tmp,__fp16* tmp_02); +void sum_3d_ax2_f16(int dim1,int dim2,int dim3,__fp16 * tmp,__fp16* tmp_03); +void sum_2d_ax0_f16(int dim1,int dim2,__fp16 * tmp,__fp16* tmp_0); +void sum_2d_ax1_f16(int dim1,int dim2,__fp16 * tmp,__fp16* tmp_1); + +void mean_4d_ax0_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void 
mean_4d_ax1_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void mean_4d_ax2_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void mean_4d_ax3_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void mean_3d_ax0_f16(int dim1,int dim2,int dim3,__fp16 * tmp,__fp16* tmp_01); +void mean_3d_ax1_f16(int dim1,int dim2,int dim3,__fp16 * tmp,__fp16* tmp_02); +void mean_3d_ax2_f16(int dim1,int dim2,int dim3,__fp16 * tmp,__fp16* tmp_03); +void mean_2d_ax0_f16(int dim1,int dim2,__fp16 * tmp,__fp16* tmp_0); +void mean_2d_ax1_f16(int dim1,int dim2,__fp16 * tmp,__fp16* tmp_1); +#endif + +static int reduce_fp16(__fp16 * data,__fp16 * out_data, int dim0, + int dim1,int dim2,int dim3,int out_size,reduce_param * param) +{ +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + int offset=0; + float* tmp=(float*)malloc(sizeof(float)*out_size); + memset(tmp, 0, sizeof(float) * out_size); + int param_dim0=param->param_dim[0]; + int param_dim1=param->param_dim[1]; + int param_dim2=param->param_dim[2]; + int param_dim3=param->param_dim[3]; + //reduce sum + if(param->type==0) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + tmp[0]+=fp16_to_fp32(data[offset]); + } + } + + } + } + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax2_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==3 && param_dim1==-2 && 
param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax3_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0_fp16(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_02); + sum_3d_ax1_fp16(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_03); + sum_3d_ax2_fp16(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp_12); + sum_3d_ax1_fp16(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp_13); + sum_3d_ax2_fp16(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else 
if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + sum_4d_ax2_fp16(dim0,dim1,dim2,dim3,data,tmp_23); + sum_3d_ax2_fp16(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0_fp16(dim1,dim2,dim3,tmp_0,tmp_01); + sum_2d_ax0_fp16(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0_fp16(dim1,dim2,dim3,tmp_1,tmp_01); + sum_2d_ax1_fp16(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + 
||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_02); + sum_3d_ax1_fp16(dim1,dim2,dim3,tmp_1,tmp_02); + sum_2d_ax1_fp16(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + sum_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp_12); + sum_3d_ax1_fp16(dim0,dim2,dim3,tmp_1,tmp_12); + sum_2d_ax1_fp16(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //reduce mean + else if(param->type==1) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + float s_tmp=0.f; + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + s_tmp+=fp16_to_fp32(data[offset]); + } + } + } + } + tmp[0]=s_tmp / (dim0*dim1*dim2*dim3); + } + else 
if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax2_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax3_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0_fp16(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_02); + mean_3d_ax1_fp16(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_03); + mean_3d_ax2_fp16(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp_12); + 
mean_3d_ax1_fp16(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp_13); + mean_3d_ax2_fp16(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + mean_4d_ax2_fp16(dim0,dim1,dim2,dim3,data,tmp_23); + mean_3d_ax2_fp16(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0_fp16(dim1,dim2,dim3,tmp_0,tmp_01); + mean_2d_ax0_fp16(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float 
*)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0_fp16(dim1,dim2,dim3,tmp_1,tmp_01); + mean_2d_ax1_fp16(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_02); + mean_3d_ax1_fp16(dim1,dim2,dim3,tmp_1,tmp_02); + mean_2d_ax1_fp16(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + mean_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp_12); + mean_3d_ax1_fp16(dim0,dim2,dim3,tmp_1,tmp_12); + mean_2d_ax1_fp16(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //pase to out_data + for(int i=0;iparam_dim[0]; + int 
param_dim1=param->param_dim[1]; + int param_dim2=param->param_dim[2]; + int param_dim3=param->param_dim[3]; + //reduce sum + if(param->type==0) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + tmp[0]+=data[offset]; + } + } + + } + } + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax2_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax3_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + __fp16 * tmp_01=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(__fp16) * dim1*dim2*dim3); + sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0_f16(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + __fp16 * tmp_02=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(__fp16) * dim1*dim2*dim3); + sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_02); + sum_3d_ax1_f16(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + 
__fp16 * tmp_03=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(__fp16) * dim1*dim2*dim3); + sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_03); + sum_3d_ax2_f16(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + __fp16 * tmp_12=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(__fp16) * dim0*dim2*dim3); + sum_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp_12); + sum_3d_ax1_f16(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + __fp16 * tmp_13=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(__fp16) * dim0*dim2*dim3); + sum_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp_13); + sum_3d_ax2_f16(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + __fp16 * tmp_23=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(__fp16) * dim0*dim1*dim3); + sum_4d_ax2_f16(dim0,dim1,dim2,dim3,data,tmp_23); + sum_3d_ax2_f16(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + __fp16 * tmp_01=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(__fp16) * dim1*dim2*dim3); + + __fp16 * tmp_0=(__fp16 *)malloc(sizeof(__fp16)*dim2*dim3); + memset(tmp_0, 0, sizeof(__fp16) * dim2*dim3); + + 
sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0_f16(dim1,dim2,dim3,tmp_0,tmp_01); + sum_2d_ax0_f16(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + __fp16 * tmp_01=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(__fp16) * dim1*dim2*dim3); + + __fp16 * tmp_1=(__fp16 *)malloc(sizeof(__fp16)*dim2*dim3); + memset(tmp_1, 0, sizeof(__fp16) * dim2*dim3); + + sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0_f16(dim1,dim2,dim3,tmp_1,tmp_01); + sum_2d_ax1_f16(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + __fp16 * tmp_02=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(__fp16) * dim1*dim2*dim3); + + __fp16 * tmp_1=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim3); + memset(tmp_1, 0, sizeof(__fp16) * dim1*dim3); + + sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_02); + sum_3d_ax1_f16(dim1,dim2,dim3,tmp_1,tmp_02); + sum_2d_ax1_f16(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 
&& param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + __fp16 * tmp_12=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(__fp16) * dim0*dim2*dim3); + + __fp16 * tmp_1=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim3); + memset(tmp_1, 0, sizeof(__fp16) * dim0*dim3); + + sum_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp_12); + sum_3d_ax1_f16(dim0,dim2,dim3,tmp_1,tmp_12); + sum_2d_ax1_f16(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //reduce mean + else if(param->type==1) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + __fp16 s_tmp=fp32_to_fp16(0); + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + s_tmp+=data[offset]; + } + } + } + } + tmp[0]=s_tmp / (dim0*dim1*dim2*dim3); + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax2_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax3_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + __fp16 * tmp_01=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(__fp16) * dim1*dim2*dim3); + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0_f16(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( 
(param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + __fp16 * tmp_02=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(__fp16) * dim1*dim2*dim3); + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_02); + mean_3d_ax1_f16(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + __fp16 * tmp_03=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(__fp16) * dim1*dim2*dim3); + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_03); + mean_3d_ax2_f16(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + __fp16 * tmp_12=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(__fp16) * dim0*dim2*dim3); + mean_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp_12); + mean_3d_ax1_f16(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + __fp16 * tmp_13=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(__fp16) * dim0*dim2*dim3); + mean_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp_13); + mean_3d_ax2_f16(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + __fp16 * tmp_23=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(__fp16) * dim0*dim1*dim3); + mean_4d_ax2_f16(dim0,dim1,dim2,dim3,data,tmp_23); + mean_3d_ax2_f16(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && 
param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + __fp16 * tmp_01=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(__fp16) * dim1*dim2*dim3); + + __fp16 * tmp_0=(__fp16 *)malloc(sizeof(__fp16)*dim2*dim3); + memset(tmp_0, 0, sizeof(__fp16) * dim2*dim3); + + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0_f16(dim1,dim2,dim3,tmp_0,tmp_01); + mean_2d_ax0_f16(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + __fp16 * tmp_01=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(__fp16) * dim1*dim2*dim3); + + __fp16 * tmp_1=(__fp16 *)malloc(sizeof(__fp16)*dim2*dim3); + memset(tmp_1, 0, sizeof(__fp16) * dim2*dim3); + + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0_f16(dim1,dim2,dim3,tmp_1,tmp_01); + mean_2d_ax1_f16(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + __fp16 * tmp_02=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(__fp16) * dim1*dim2*dim3); + + __fp16 * tmp_1=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim3); + 
memset(tmp_1, 0, sizeof(__fp16) * dim1*dim3); + + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_02); + mean_3d_ax1_f16(dim1,dim2,dim3,tmp_1,tmp_02); + mean_2d_ax1_f16(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + __fp16 * tmp_12=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(__fp16) * dim0*dim2*dim3); + + __fp16 * tmp_1=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim3); + memset(tmp_1, 0, sizeof(__fp16) * dim0*dim3); + + mean_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp_12); + mean_3d_ax1_f16(dim0,dim2,dim3,tmp_1,tmp_12); + mean_2d_ax1_f16(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //pase to out_data + for(int i=0;iparam_dim[0]; + int param_dim1=param->param_dim[1]; + int param_dim2=param->param_dim[2]; + int param_dim3=param->param_dim[3]; + //reduce sum + if(param->type==0) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + tmp[0]+=data[offset]; + } + } + + } + } + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax1(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + 
sum_4d_ax2(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax3(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_02); + sum_3d_ax1(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_03); + sum_3d_ax2(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1(dim0,dim1,dim2,dim3,data,tmp_12); + sum_3d_ax1(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1(dim0,dim1,dim2,dim3,data,tmp_13); + sum_3d_ax2(dim0,dim2,dim3,tmp,tmp_13); + 
+ free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + sum_4d_ax2(dim0,dim1,dim2,dim3,data,tmp_23); + sum_3d_ax2(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0(dim1,dim2,dim3,tmp_0,tmp_01); + sum_2d_ax0(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0(dim1,dim2,dim3,tmp_1,tmp_01); + sum_2d_ax1(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + 
||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_02); + sum_3d_ax1(dim1,dim2,dim3,tmp_1,tmp_02); + sum_2d_ax1(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + sum_4d_ax1(dim0,dim1,dim2,dim3,data,tmp_12); + sum_3d_ax1(dim0,dim2,dim3,tmp_1,tmp_12); + sum_2d_ax1(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //reduce mean + else if(param->type==1) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + float s_tmp=0.f; + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + s_tmp+=data[offset]; + } + } + } + } + tmp[0]=s_tmp / (dim0*dim1*dim2*dim3); + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 
&& param_dim3==-2) + { + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax1(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax2(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax3(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_02); + mean_3d_ax1(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_03); + mean_3d_ax2(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1(dim0,dim1,dim2,dim3,data,tmp_12); + mean_3d_ax1(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( 
(param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1(dim0,dim1,dim2,dim3,data,tmp_13); + mean_3d_ax2(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + mean_4d_ax2(dim0,dim1,dim2,dim3,data,tmp_23); + mean_3d_ax2(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0(dim1,dim2,dim3,tmp_0,tmp_01); + mean_2d_ax0(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float 
*)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0(dim1,dim2,dim3,tmp_1,tmp_01); + mean_2d_ax1(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_02); + mean_3d_ax1(dim1,dim2,dim3,tmp_1,tmp_02); + mean_2d_ax1(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + mean_4d_ax1(dim0,dim1,dim2,dim3,data,tmp_12); + mean_3d_ax1(dim0,dim2,dim3,tmp_1,tmp_12); + mean_2d_ax1(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //pase to out_data + for(int i=0;iparam_dim[0]; + int param_dim1=param->param_dim[1]; + int param_dim2=param->param_dim[2]; + int param_dim3=param->param_dim[3]; + //reduce sum + if(param->type==0) + { + if((param_dim0==-2 
&& param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = data[offset] * param->scale[0]; + tmp[0]+=real_input0; + } + } + + } + } + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax2_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax3_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_01,param); + sum_3d_ax0_int8(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_02,param); + sum_3d_ax1_int8(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, 
sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_03,param); + sum_3d_ax2_int8(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp_12,param); + sum_3d_ax1_int8(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp_13,param); + sum_3d_ax2_int8(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + sum_4d_ax2_int8(dim0,dim1,dim2,dim3,data,tmp_23,param); + sum_3d_ax2_int8(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_01,param); + 
sum_3d_ax0_int8(dim1,dim2,dim3,tmp_0,tmp_01); + sum_2d_ax0_int8(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_01,param); + sum_3d_ax0_int8(dim1,dim2,dim3,tmp_1,tmp_01); + sum_2d_ax1_int8(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_02,param); + sum_3d_ax1_int8(dim1,dim2,dim3,tmp_1,tmp_02); + sum_2d_ax1_int8(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && 
param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + sum_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp_12,param); + sum_3d_ax1_int8(dim0,dim2,dim3,tmp_1,tmp_12); + sum_2d_ax1_int8(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //reduce mean + else if(param->type==1) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + float s_tmp=0.f; + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = data[offset] * param->scale[0]; + s_tmp+=real_input0; + } + } + } + } + tmp[0]=s_tmp / (dim0*dim1*dim2*dim3); + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax2_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax3_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_01,param); + mean_3d_ax0_int8(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else 
if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_02,param); + mean_3d_ax1_int8(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_03,param); + mean_3d_ax2_int8(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp_12,param); + mean_3d_ax1_int8(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp_13,param); + mean_3d_ax2_int8(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + mean_4d_ax2_int8(dim0,dim1,dim2,dim3,data,tmp_23,param); + mean_3d_ax2_int8(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && 
param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_01,param); + mean_3d_ax0_int8(dim1,dim2,dim3,tmp_0,tmp_01); + mean_2d_ax0_int8(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_01,param); + mean_3d_ax0_int8(dim1,dim2,dim3,tmp_1,tmp_01); + mean_2d_ax1_int8(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * 
tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_02,param); + mean_3d_ax1_int8(dim1,dim2,dim3,tmp_1,tmp_02); + mean_2d_ax1_int8(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + mean_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp_12,param); + mean_3d_ax1_int8(dim0,dim2,dim3,tmp_1,tmp_12); + mean_2d_ax1_int8(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + + float output_max = 0.0f; + for(int i =0; i< out_size; i++) + { + if(output_max < fabs(tmp[i])) + output_max = fabs(tmp[i]); + } + param->scale[1] = output_max/127; + //pase to out_data + for(int i=0;iscale[0]; + s_tmp+=real_input0; + + } + tmp[j]=s_tmp / dim0; + } +} +void mean_4d_ax1_int8(int dim0,int dim1,int dim2,int dim3,int8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nscale[0]; + s_tmp+=real_input0; + } + tmp[n*dim2*dim3+cw]=s_tmp/dim1; + } + } +} +void mean_4d_ax2_int8(int dim0,int dim1,int dim2,int dim3,int8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nscale[0]; + s_tmp+=real_input0; + } + tmp[n*dim1*dim3+h*dim3+c]=s_tmp/dim2; + } + } + } +} +void mean_4d_ax3_int8(int dim0,int dim1,int dim2,int dim3,int8_t * data,float * tmp,reduce_param * param) +{ + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + float s_tmp=0.f; 
+ for(int c = 0; c < dim3; c++) + { + int offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = data[offset] * param->scale[0]; + s_tmp+=real_input0; + } + tmp[n*dim1*dim2+h*dim2+w]=s_tmp/dim3; + } + } + } +} +void mean_3d_ax0_int8(int dim1,int dim2,int dim3,float * tmp,float* tmp_01) +{ + for(int wc=0;wcscale[0]; + tmp[j] +=real_input0; + } + } +} +void sum_4d_ax1_int8(int dim0,int dim1,int dim2,int dim3,int8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nscale[0]; + tmp[n*dim2*dim3+cw]+=real_input0; + } + } + } +} +void sum_4d_ax2_int8(int dim0,int dim1,int dim2,int dim3,int8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nscale[0]; + tmp[n*dim1*dim3+h*dim3+c]+=real_input0; + } + } + } + } +} +void sum_4d_ax3_int8(int dim0,int dim1,int dim2,int dim3,int8_t * data,float * tmp,reduce_param * param) +{ + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + int offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = data[offset] * param->scale[0]; + tmp[n*dim1*dim2+h*dim2+w]+=real_input0; + } + } + } + } +} +void sum_3d_ax0_int8(int dim1,int dim2,int dim3,float * tmp,float* tmp_01) +{ + for(int wc=0;wcparam_dim[0]; + int param_dim1=param->param_dim[1]; + int param_dim2=param->param_dim[2]; + int param_dim3=param->param_dim[3]; + //reduce sum + if(param->type==0) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = (data[offset] - param->zero[0]) * param->scale[0]; + tmp[0]+=real_input0; + } + } + + } + } + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + 
sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax2_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax3_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_01,param); + sum_3d_ax0_uint8(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_02,param); + sum_3d_ax1_uint8(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_03,param); + sum_3d_ax2_uint8(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp_12,param); + 
sum_3d_ax1_uint8(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp_13,param); + sum_3d_ax2_uint8(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + sum_4d_ax2_uint8(dim0,dim1,dim2,dim3,data,tmp_23,param); + sum_3d_ax2_uint8(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_01,param); + sum_3d_ax0_uint8(dim1,dim2,dim3,tmp_0,tmp_01); + sum_2d_ax0_uint8(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + 
float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_01,param); + sum_3d_ax0_uint8(dim1,dim2,dim3,tmp_1,tmp_01); + sum_2d_ax1_uint8(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_02,param); + sum_3d_ax1_uint8(dim1,dim2,dim3,tmp_1,tmp_02); + sum_2d_ax1_uint8(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + sum_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp_12,param); + sum_3d_ax1_uint8(dim0,dim2,dim3,tmp_1,tmp_12); + sum_2d_ax1_uint8(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //reduce mean + 
else if(param->type==1) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + float s_tmp=0.f; + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = (data[offset] - param->zero[0]) * param->scale[0]; + s_tmp+=real_input0; + } + } + } + } + tmp[0]=s_tmp / (dim0*dim1*dim2*dim3); + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax2_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax3_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_01,param); + mean_3d_ax0_uint8(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_02,param); + mean_3d_ax1_uint8(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) 
||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_03,param); + mean_3d_ax2_uint8(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp_12,param); + mean_3d_ax1_uint8(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp_13,param); + mean_3d_ax2_uint8(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + mean_4d_ax2_uint8(dim0,dim1,dim2,dim3,data,tmp_23,param); + mean_3d_ax2_uint8(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float 
*)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_01,param); + mean_3d_ax0_uint8(dim1,dim2,dim3,tmp_0,tmp_01); + mean_2d_ax0_uint8(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_01,param); + mean_3d_ax0_uint8(dim1,dim2,dim3,tmp_1,tmp_01); + mean_2d_ax1_uint8(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_02,param); + mean_3d_ax1_uint8(dim1,dim2,dim3,tmp_1,tmp_02); + mean_2d_ax1_uint8(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && 
param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + mean_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp_12,param); + mean_3d_ax1_uint8(dim0,dim2,dim3,tmp_1,tmp_12); + mean_2d_ax1_uint8(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //pase to out_data + for(int i=0;iscale[1]) + param->zero[1]; + } + free(tmp); + return 0; +} +//mean +void mean_4d_ax0_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int j=0;jzero[0]) * param->scale[0]; + s_tmp+=real_input0; + + } + tmp[j]=s_tmp / dim0; + } +} +void mean_4d_ax1_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nzero[0]) * param->scale[0]; + s_tmp+=real_input0; + } + tmp[n*dim2*dim3+cw]=s_tmp/dim1; + } + } +} +void mean_4d_ax2_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nzero[0]) * param->scale[0]; + s_tmp+=real_input0; + } + tmp[n*dim1*dim3+h*dim3+c]=s_tmp/dim2; + } + } + } +} +void mean_4d_ax3_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + float s_tmp=0.f; + for(int c = 0; c < dim3; c++) + { + int offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = (data[offset] - param->zero[0]) * param->scale[0]; + s_tmp+=real_input0; + } + tmp[n*dim1*dim2+h*dim2+w]=s_tmp/dim3; + } + } + } +} +void mean_3d_ax0_uint8(int dim1,int dim2,int dim3,float * tmp,float* tmp_01) +{ + for(int 
wc=0;wczero[0]) * param->scale[0]; + tmp[j] +=real_input0; + } + } +} +void sum_4d_ax1_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nzero[0]) * param->scale[0]; + tmp[n*dim2*dim3+cw]+=real_input0; + } + } + } +} +void sum_4d_ax2_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nzero[0]) * param->scale[0]; + tmp[n*dim1*dim3+h*dim3+c]+=real_input0; + } + } + } + } +} +void sum_4d_ax3_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + int offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = (data[offset] - param->zero[0]) * param->scale[0]; + tmp[n*dim1*dim2+h*dim2+w]+=real_input0; + } + } + } + } +} +void sum_3d_ax0_uint8(int dim1,int dim2,int dim3,float * tmp,float* tmp_01) +{ + for(int wc=0;wcinput_size / sizeof(__fp16)); + int in_num = param->in_num; + +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float *buff = (float*)malloc(input_size); + for(int i = 0; i < in_num; ++i) + { + __fp16 *input_data = input[i]; + for(int j = 0; j < input_size; ++j) + { + float data = fp16_to_fp32(input_data[j]); + buff[j] += data; + } + } + for(int j = 0; j < input_size; ++j) + { + output[j] = fp32_to_fp16(buff[j]); + } + + free(buff); +#else + for(int i = 0; i < in_num; ++i) + { + __fp16 *input_data = input[i]; + for(int j = 0; j < input_size; ++j) + { + output[j] += input_data[j]; + } + } +#endif + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_add_n/ref_addn_fp32.c b/executor/operator/ref/kernel/ref_add_n/ref_addn_fp32.c new file mode 100644 index 000000000..50857e78d --- /dev/null +++ b/executor/operator/ref/kernel/ref_add_n/ref_addn_fp32.c @@ -0,0 +1,16 @@ +static int ref_addn_fp32(float**input,float * output, const ref_addn_param* param) +{ + 
int input_size = (param->input_size / sizeof(float)); + int in_num = param->in_num; + + for(int i = 0; i < in_num; ++i) + { + float *input_data = input[i]; + for(int j = 0; j < input_size; ++j) + { + output[j] += input_data[j]; + } + } + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_add_n/ref_addn_int8.c b/executor/operator/ref/kernel/ref_add_n/ref_addn_int8.c new file mode 100644 index 000000000..8497f45b8 --- /dev/null +++ b/executor/operator/ref/kernel/ref_add_n/ref_addn_int8.c @@ -0,0 +1,35 @@ +static int ref_addn_int8(int8_t**input,int8_t * output, ref_addn_param* param) +{ + int input_size = param->input_size; + int in_num = param->in_num; + + float *out_f32 = (float*)malloc(input_size); + memset(out_f32,0,input_size); + for(int i = 0; i < in_num; ++i) + { + int8_t *input_data = input[i]; + float input_scale = param->in_scale[i]; + for(int j = 0; j < input_size; ++j) + { + out_f32[j] += input_data[j] * input_scale; + } + } + float output_max = 0.0f; + for(int i =0; i< input_size; i++) + { + if(output_max < fabs(out_f32[i])) + output_max = fabs(out_f32[i]); + } + param->out_scale = output_max / 127; + param->out_zero = 0; + for(int i =0; i< input_size; i++) + { + int s32_out = round(out_f32[i]*127/output_max); + if(s32_out > 127) s32_out = 127; + if(s32_out < -127 ) s32_out = -127; + output[i] = s32_out; + } + free(out_f32); + out_f32 = NULL; + return 0; +} diff --git a/executor/operator/ref/kernel/ref_add_n/ref_addn_kernel.h b/executor/operator/ref/kernel/ref_add_n/ref_addn_kernel.h new file mode 100644 index 000000000..64effac3a --- /dev/null +++ b/executor/operator/ref/kernel/ref_add_n/ref_addn_kernel.h @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +#ifndef __REF_ADDN_KERNEL_H__ +#define __REF_ADDN_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ref_addn_param +{ + float * in_scale; + int *in_zero; + int in_num; + int input_size; + float out_scale; + int out_zero; +}; + +typedef int (*ref_add_n_kernel_t)(uint8_t **input,uint8_t * output,const ref_addn_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_addn_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_addn_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_addn_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_addn_uint8.c" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/ref_add_n/ref_addn_uint8.c b/executor/operator/ref/kernel/ref_add_n/ref_addn_uint8.c new file mode 100644 index 000000000..558cb0f7c --- /dev/null +++ b/executor/operator/ref/kernel/ref_add_n/ref_addn_uint8.c @@ -0,0 +1,31 @@ +static int ref_addn_uint8(uint8_t**input,uint8_t * output, const ref_addn_param* param) +{ + int input_size = param->input_size; + int in_num = param->in_num; + float *out_f32 = (float*)malloc(input_size); + memset(out_f32,0,input_size); + for(int i = 0; i < in_num; ++i) + { + uint8_t *input_data = input[i]; + float input_scale = param->in_scale[i]; 
+ int zero_point = param->in_zero[i]; + for(int j = 0; j < input_size; ++j) + { + out_f32[j] += (input_data[j] * input_scale + zero_point); + } + } + for(int j = 0; j < input_size; ++j) + { + int s32_out = round(out_f32[j]/param->out_scale) + param->out_zero; + if(s32_out > 255) + s32_out = 255; + if(s32_out < 0 ) + s32_out = 0; + output[j] = s32_out; + } + free(out_f32); + out_f32 = NULL; + + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_fp16.c b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_fp16.c new file mode 100644 index 000000000..1bb8483e5 --- /dev/null +++ b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_fp16.c @@ -0,0 +1,64 @@ +static int ref_batchnorm_fp16(__fp16*input,__fp16 * output,const ref_batchnorm_param* param) +{ + float* scale_mean = param->scale_mean; + float* scale_var_inv = param->scale_var_inv; + float* gamma = param->gamma; + float* beta = param->beta; + + int img_size = param->input_c * param->input_h * param->input_w; + float *out_f32 = (float*)malloc(sizeof(float)* img_size * param->input_n); + memset(out_f32,0,sizeof(float) * img_size); + for(int n = 0; n < param->input_n; ++n) + { + for(int h = 0; h < param->input_h; ++h) + { + for(int w = 0; w < param->input_w;++w) + { + for(int c = 0; c < param->input_c; ++c) + { + float s_mean = scale_mean[c]; + float s_var = scale_var_inv[c]; + float s_val1 = s_mean; + float s_val2 = s_var; + if(!param->iscaffe) + { + float s_gamma = gamma[c]; + float s_beta = beta[c]; + s_val1 = s_beta + s_gamma * s_mean; + s_val2 = s_gamma * s_var; + } + int offset = 0; + if(TENGINE_LAYOUT_NCHW == param->layout) + { + offset = n * img_size + c * param->input_h * param->input_w + + h * param->input_w + w; + } + else + { + offset = n * img_size + h * param->input_w * param->input_c + + w * param->input_c + c; + } + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float data = fp16_to_fp32(input[offset]); + #else + __fp16 data = input[offset]; + #endif + 
out_f32[offset] = data * s_val2 + s_val1; + } + } + } + } + for(int j = 0; j < img_size * param->input_n; ++j) + { + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + output[j] = fp32_to_fp16(out_f32[j]); + #else + output[j] = (__fp16)out_f32[j]; + #endif + } + free(out_f32); + out_f32 = NULL; + + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_fp32.c b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_fp32.c new file mode 100644 index 000000000..263b8a0fd --- /dev/null +++ b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_fp32.c @@ -0,0 +1,47 @@ +static int ref_batchnorm_fp32(float*input,float * output,const ref_batchnorm_param* param) +{ + float* scale_mean = param->scale_mean; + float* scale_var_inv = param->scale_var_inv; + float* gamma = param->gamma; + float* beta = param->beta; + + int img_size = param->input_c * param->input_h * param->input_w; + + for(int n = 0; n < param->input_n; ++n) + { + for(int h = 0; h < param->input_h; ++h) + { + for(int w = 0; w < param->input_w;++w) + { + for(int c = 0; c < param->input_c; ++c) + { + float s_mean = scale_mean[c]; + float s_var = scale_var_inv[c]; + float s_val1 = s_mean; + float s_val2 = s_var; + if(!param->iscaffe) + { + float s_gamma = gamma[c]; + float s_beta = beta[c]; + s_val1 = s_beta + s_gamma * s_mean; + s_val2 = s_gamma * s_var; + } + int offset = 0; + if(TENGINE_LAYOUT_NCHW == param->layout) + { + offset = n * img_size + c * param->input_h * param->input_w + + h * param->input_w + w; + } + else + { + offset = n * img_size + h * param->input_w * param->input_c + + w * param->input_c + c; + } + output[offset] = input[offset] * s_val2 + s_val1; + } + } + } + } + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_int8.c b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_int8.c new file mode 100644 index 000000000..1e241e8e7 --- /dev/null +++ b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_int8.c @@ -0,0 +1,68 
@@ +static int ref_batchnorm_int8(int8_t*input,int8_t * output,ref_batchnorm_param* param) +{ + float* scale_mean = param->scale_mean; + float* scale_var_inv = param->scale_var_inv; + float* gamma = param->gamma; + float* beta = param->beta; + + int img_size = param->input_c * param->input_h * param->input_w; + float *out_f32 = (float*)malloc(sizeof(float)* img_size * param->input_n); + memset(out_f32,0,sizeof(float) * img_size * param->input_n); + for(int n = 0; n < param->input_n; ++n) + { + for(int h = 0; h < param->input_h; ++h) + { + for(int w = 0; w < param->input_w;++w) + { + for(int c = 0; c < param->input_c; ++c) + { + float s_mean = scale_mean[c]; + float s_var = scale_var_inv[c]; + float s_val1 = s_mean; + float s_val2 = s_var; + if(!param->iscaffe) + { + float s_gamma = gamma[c]; + float s_beta = beta[c]; + s_val1 = s_beta + s_gamma * s_mean; + s_val2 = s_gamma * s_var; + } + int offset = 0; + if(TENGINE_LAYOUT_NCHW == param->layout) + { + offset = n * img_size + c * param->input_h * param->input_w + + h * param->input_w + w; + } + else + { + offset = n * img_size + h * param->input_w * param->input_c + + w * param->input_c + c; + } + float data = (float)param->in_scale * (input[offset] - param->in_zero); + out_f32[offset] = data * s_val2 + s_val1; + } + } + } + } + float output_max = 0.0f; + for(int i =0; i< img_size*param->input_n; i++) + { + if(output_max < fabs(out_f32[i])) + output_max = fabs(out_f32[i]); + } + param->out_scale = output_max / 127; + param->out_zero = 0; + for(int i =0; i< img_size*param->input_n; i++) + { + int s32_out = round(out_f32[i]*127/output_max); + if(s32_out > 127) s32_out = 127; + if(s32_out < -127 ) s32_out = -127; + output[i] = s32_out; + } + + free(out_f32); + out_f32 = NULL; + + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_kernel.h b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_kernel.h new file mode 100644 index 000000000..337dd5512 --- /dev/null +++ 
b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_kernel.h @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +#ifndef __REF_BATCHNORM_KERNEL_H__ +#define __REF_BATCHNORM_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ref_batchnorm_param +{ + int input_n; + int input_h; + int input_w; + int input_c; + int layout; + bool iscaffe; + float* scale_mean; + float* scale_var_inv; + float* gamma; + float* beta; + float in_scale; + int in_zero; + float out_scale; + int out_zero; +}; + +typedef int (*ref_batchnorm_kernel_t)(uint8_t *input,uint8_t * output,const ref_batchnorm_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_batchnorm_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_batchnorm_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_batchnorm_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_batchnorm_uint8.c" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_uint8.c 
b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_uint8.c new file mode 100644 index 000000000..daadabb9d --- /dev/null +++ b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_uint8.c @@ -0,0 +1,62 @@ +static int ref_batchnorm_uint8(uint8_t*input,uint8_t* output,const ref_batchnorm_param* param) +{ + float* scale_mean = param->scale_mean; + float* scale_var_inv = param->scale_var_inv; + float* gamma = param->gamma; + float* beta = param->beta; + + int img_size = param->input_c * param->input_h * param->input_w; + float *out_f32 = (float*)malloc(sizeof(float)* img_size * param->input_n); + memset(out_f32,0,sizeof(float) * img_size * param->input_n); + for(int n = 0; n < param->input_n; ++n) + { + for(int h = 0; h < param->input_h; ++h) + { + for(int w = 0; w < param->input_w;++w) + { + for(int c = 0; c < param->input_c; ++c) + { + float s_mean = scale_mean[c]; + float s_var = scale_var_inv[c]; + float s_val1 = s_mean; + float s_val2 = s_var; + if(!param->iscaffe) + { + float s_gamma = gamma[c]; + float s_beta = beta[c]; + s_val1 = s_beta + s_gamma * s_mean; + s_val2 = s_gamma * s_var; + } + int offset = 0; + if(TENGINE_LAYOUT_NCHW == param->layout) + { + offset = n * img_size + c * param->input_h * param->input_w + + h * param->input_w + w; + } + else + { + offset = n * img_size + h * param->input_w * param->input_c + + w * param->input_c + c; + } + + float data = param->in_scale*(input[offset]-param->in_zero); + out_f32[offset] = data * s_val2 + s_val1; + } + } + } + } + for(int j = 0; j < img_size * param->input_n; ++j) + { + int s32_out = round(out_f32[j]/param->out_scale) + param->out_zero; + if(s32_out > 255) + s32_out = 255; + if(s32_out < 0 ) + s32_out = 0; + output[j] = s32_out; + } + free(out_f32); + out_f32 = NULL; + + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_normalize/ref_normalize_fp16.c b/executor/operator/ref/kernel/ref_normalize/ref_normalize_fp16.c new file mode 100644 index 000000000..b2818cfa3 --- /dev/null +++ 
b/executor/operator/ref/kernel/ref_normalize/ref_normalize_fp16.c @@ -0,0 +1,93 @@ + +static int ref_normalize_fp16(__fp16*input,__fp16 * output,__fp16* scale, const ref_normalize_param* param) +{ + int batch_num = param->input_n; + int in_h = param->input_h; + int in_w = param->input_w; + int in_c = param->input_c; + int in_offset = 0; + int out_offset = 0; + + float *buff = (float*)malloc(sizeof(float)* in_h * in_w); + __fp16 *in_buf = input; + __fp16 *out_buf = output; + + for(int n = 0; n < batch_num; ++n) + { + memset(buff,0,sizeof(float)*in_h*in_w); + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + in_offset = n * in_h * in_w * in_c + c * in_h * in_w + h * in_w + w; + } + else // nhwc + { + in_offset = n * in_h * in_w * in_c + h * in_w * in_c + w * in_c + c; + } + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float data = fp16_to_fp32(in_buf[in_offset]); + #else + __fp16 data = in_buf[in_offset]; + #endif + buff[buff_idx] += data * data; + } + buff[buff_idx] = 1.f / sqrt(buff[buff_idx]); + } + } + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + out_offset = n * in_h * in_w * in_c + c * in_h * in_w + h * in_w + w; + in_offset = out_offset; + } + else // nhwc + { + out_offset = n * in_h * in_w * in_c + h * in_w * in_c + w * in_c + c; + in_offset = out_offset; + } + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float in_data = fp16_to_fp32(in_buf[in_offset]); + #else + __fp16 in_data = in_buf[in_offset]; + #endif + float data = buff[buff_idx]; + + float out_data = in_data * data; + if(scale) + { + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float scale_data = fp16_to_fp32(scale[c]); + #else + __fp16 scale_data = scale[c]; + #endif + + out_data = out_data * scale_data; + } + + #if!defined( __ARM_ARCH) || __ARM_ARCH 
<8 + out_buf[out_offset] = fp32_to_fp16(out_data); + #else + out_buf[out_offset] = (__fp16)out_data; + #endif + } + } + } + } + + free(buff); + buff = NULL; + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_normalize/ref_normalize_fp32.c b/executor/operator/ref/kernel/ref_normalize/ref_normalize_fp32.c new file mode 100644 index 000000000..99f1f90ec --- /dev/null +++ b/executor/operator/ref/kernel/ref_normalize/ref_normalize_fp32.c @@ -0,0 +1,75 @@ + +static int ref_normalize_fp32(float*input,float * output,float* scale,const ref_normalize_param* param) +{ + + int batch_num = param->input_n; + int in_h = param->input_h; + int in_w = param->input_w; + int in_c = param->input_c; + int in_offset = 0; + int out_offset = 0; + + float *buff = (float*)malloc(sizeof(float)* in_h * in_w); + float *in_buf = input; + float *out_buf = output; + + for(int n = 0; n < batch_num; ++n) + { + in_buf = input + n * in_h * in_w * in_c; + out_buf = output + n * in_h * in_w * in_c; + memset(buff,0,sizeof(float)*in_h*in_w); + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + in_offset = c * in_h * in_w + h * in_w + w; + } + else // nhwc + { + in_offset = h * in_w * in_c + w * in_c + c; + } + float data = in_buf[in_offset]; + buff[buff_idx] += data * data; + } + buff[buff_idx] = 1.f / sqrt(buff[buff_idx]); + } + } + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + out_offset = c * in_h * in_w + h * in_w + w; + in_offset = out_offset; + } + else // nhwc + { + out_offset = h * in_w * in_c + w * in_c + c; + in_offset = out_offset; + } + float data = buff[buff_idx]; + float in_data = in_buf[in_offset]; + out_buf[out_offset] = in_data * data; + if(scale) + { + out_buf[out_offset] = out_buf[out_offset] * scale[c]; + } + } + } + } + } 
+ + free(buff); + buff = NULL; + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_normalize/ref_normalize_int8.c b/executor/operator/ref/kernel/ref_normalize/ref_normalize_int8.c new file mode 100644 index 000000000..389784f77 --- /dev/null +++ b/executor/operator/ref/kernel/ref_normalize/ref_normalize_int8.c @@ -0,0 +1,93 @@ +static int ref_normalize_int8(int8_t*input, int8_t * output,int8_t* scale,ref_normalize_param* param) +{ + + int batch_num = param->input_n; + int in_h = param->input_h; + int in_w = param->input_w; + int in_c = param->input_c; + int in_offset = 0; + int out_offset = 0; + + int out_size = batch_num * in_h * in_w * in_c; + float *buff = (float*)malloc(sizeof(float)* in_h * in_w); + float *out_f32 = (float*)malloc(sizeof(float)* out_size); + int8_t *in_buf = input; + float * out_f32_tmp = out_f32; + for(int n = 0; n < batch_num; ++n) + { + in_buf = input + n * in_h * in_w * in_c; + out_f32_tmp = out_f32 + n * in_h * in_w * in_c; + memset(buff,0,sizeof(float)*in_h*in_w); + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + in_offset = c * in_h * in_w + h * in_w + w; + } + else // nhwc + { + in_offset = h * in_w * in_c + w * in_c + c; + } + float data = (float)param->in_scale * (in_buf[in_offset] - param->in_zero); + buff[buff_idx] += data * data; + } + buff[buff_idx] = 1.f / sqrt(buff[buff_idx]); + } + } + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + out_offset = c * in_h * in_w + h * in_w + w; + in_offset = out_offset; + } + else // nhwc + { + out_offset = h * in_w * in_c + w * in_c + c; + in_offset = out_offset; + } + float data = buff[buff_idx]; + float in_data = (float)param->in_scale * (in_buf[in_offset] + param->in_zero); + float out_data = in_data * data; + if(scale) + { + 
out_data = out_data * param->scale_scale * (scale[c] + param->scale_zero); + } + out_f32_tmp[out_offset] = out_data; + } + } + } + + } + float output_max = 0.0f; + for(int i =0; i< out_size; i++) + { + if(output_max < fabs(out_f32[i])) + output_max = fabs(out_f32[i]); + } + param->out_scale = output_max / 127; + param->out_zero = 0; + for(int i =0; i< out_size; i++) + { + int s32_out = round(out_f32[i]*127/output_max); + if(s32_out > 127) s32_out = 127; + if(s32_out < -127 ) s32_out = -127; + output[i] = s32_out; + } + free(buff); + free(out_f32); + out_f32 = NULL; + buff = NULL; + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_normalize/ref_normalize_kernel.h b/executor/operator/ref/kernel/ref_normalize/ref_normalize_kernel.h new file mode 100644 index 000000000..51d807eba --- /dev/null +++ b/executor/operator/ref/kernel/ref_normalize/ref_normalize_kernel.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +#ifndef __REF_NORMAL_KERNEL_H__ +#define __REF_NORMAL_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ref_normalize_param +{ + int input_n; + int input_h; + int input_w; + int input_c; + int across_spatial; + int channel_shared; + int layout; + float in_scale; + int in_zero; + float out_scale; + int out_zero; + float scale_scale; + int scale_zero; +}; + +typedef int (*ref_normalize_kernel_t)(void *input,void * output,void* scale,const ref_normalize_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_normalize_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_normalize_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_normalize_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_normalize_uint8.c" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/ref_normalize/ref_normalize_uint8.c b/executor/operator/ref/kernel/ref_normalize/ref_normalize_uint8.c new file mode 100644 index 000000000..6e4895781 --- /dev/null +++ b/executor/operator/ref/kernel/ref_normalize/ref_normalize_uint8.c @@ -0,0 +1,80 @@ + +static int ref_normalize_uint8(uint8_t*input, uint8_t * output,uint8_t* scale,const ref_normalize_param* param) +{ + + int batch_num = param->input_n; + int in_h = param->input_h; + int in_w = param->input_w; + int in_c = param->input_c; + int in_offset = 0; + int out_offset = 0; + + float *buff = (float*)malloc(sizeof(float)* in_h * in_w); + uint8_t *in_buf = input; + uint8_t *out_buf = output; + + for(int n = 0; n < batch_num; ++n) + { + in_buf = input + n * in_h * in_w * in_c; + out_buf = output + n * in_h * in_w * in_c; + memset(buff,0,sizeof(float)*in_h*in_w); + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + 
{ + in_offset = c * in_h * in_w + h * in_w + w; + } + else // nhwc + { + in_offset = h * in_w * in_c + w * in_c + c; + } + float data = (float)param->in_scale * (in_buf[in_offset] + param->in_zero); + buff[buff_idx] += data * data; + } + buff[buff_idx] = 1.f / sqrt(buff[buff_idx]); + } + } + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + out_offset = c * in_h * in_w + h * in_w + w; + in_offset = out_offset; + } + else // nhwc + { + out_offset = h * in_w * in_c + w * in_c + c; + in_offset = out_offset; + } + float data = buff[buff_idx]; + float in_data = (float)param->in_scale * (in_buf[in_offset] - param->in_zero); + float out_data = in_data * data; + if(scale) + { + out_data = out_data * param->scale_scale * (scale[c] - param->scale_zero); + } + int s32_out = round(out_data/param->out_scale) + param->out_zero; + if(s32_out > 255) s32_out = 255; + if(s32_out < 0 ) s32_out = 0; + out_buf[out_offset] = s32_out; + } + } + } + } + + free(buff); + buff = NULL; + + return 0; +} + diff --git a/executor/operator/ref/kernel/relu/relu.h b/executor/operator/ref/kernel/relu/relu.h new file mode 100644 index 000000000..2b377895b --- /dev/null +++ b/executor/operator/ref/kernel/relu/relu.h @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __RELU_KERNEL_H__ +#define __RELU_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" +#include "relu_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int (*relu_t)(void * data, int size, float negative_slope, float scale, int zero_point); + + +#ifdef CONFIG_KERNEL_FP32 +#include "relu_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "relu_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "relu_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "relu_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/relu/relu_common.h b/executor/operator/ref/kernel/relu/relu_common.h new file mode 100644 index 000000000..70994c29b --- /dev/null +++ b/executor/operator/ref/kernel/relu/relu_common.h @@ -0,0 +1,7 @@ +#ifndef __RELU_COMMON_H__ +#define __RELU_COMMON_H__ + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/relu/relu_fp16.c b/executor/operator/ref/kernel/relu/relu_fp16.c new file mode 100644 index 000000000..abc753ab4 --- /dev/null +++ b/executor/operator/ref/kernel/relu/relu_fp16.c @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu_fp16(__fp16 * data, int size, float negative_slope, float scale, int zero_point) +{ + +/* for arm32 && x86 */ +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + //__fp16* data = ( __fp16* )data; + if(negative_slope == 0.0) + { + for(int i = 0; i < size; i++) + { + + data[i] = fp32_to_fp16(MAX(fp16_to_fp32(data[i]), 0.0)); + + + } + } + else + { + for(int i = 0; i < size; i++) + { + float bias= negative_slope * MIN(fp16_to_fp32(data[i]), 0.f); + data[i] = fp32_to_fp16(MAX(fp16_to_fp32(data[i]), 0.f) +bias); + } + } +#else + if(negative_slope == 0) + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], 0.f); + } + } + else + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], 0.f) + negative_slope * MIN(data[i], 0.f); + } + } +#endif + return 0; +} diff --git a/executor/operator/ref/kernel/relu/relu_fp32.c b/executor/operator/ref/kernel/relu/relu_fp32.c new file mode 100644 index 000000000..978ec5d20 --- /dev/null +++ b/executor/operator/ref/kernel/relu/relu_fp32.c @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu_fp32(float * data, int size, float negative_slope, float scale, int zero_point) +{ + //float* out_data = ( float* )data; + if(negative_slope == 0) + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], 0); + } + } + else + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], 0.f) + negative_slope * MIN(data[i], 0.f); + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/relu/relu_int8.c b/executor/operator/ref/kernel/relu/relu_int8.c new file mode 100644 index 000000000..4bd429008 --- /dev/null +++ b/executor/operator/ref/kernel/relu/relu_int8.c @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu_int8(int8_t * data, int size, float negative_slope, float scale, int zero_point) +{ + if(negative_slope == 0) + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], 0); + } + } + else + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], 0.f) + negative_slope * MIN(data[i], 0.f); + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/relu/relu_uint8.c b/executor/operator/ref/kernel/relu/relu_uint8.c new file mode 100644 index 000000000..62aafa940 --- /dev/null +++ b/executor/operator/ref/kernel/relu/relu_uint8.c @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu_uint8(uint8_t * data, int size, float negative_slope, float scale, int zero_point) +{ + if(negative_slope == 0) + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], zero_point); + } + } + else + { + for(int i = 0; i < size; i++) + { + if(data[i] < zero_point) + { + data[i] = round(negative_slope * data[i]); + } + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/relu6/relu6.h b/executor/operator/ref/kernel/relu6/relu6.h new file mode 100644 index 000000000..5b0eac576 --- /dev/null +++ b/executor/operator/ref/kernel/relu6/relu6.h @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __RELU6_KERNEL_H__ +#define __RELU6_KERNEL_H__ + +#include +#include +#include "compiler_fp16.h" +#include "relu6_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int (*relu6_t)(void * data, int size, float scale, int zero_point); + + +#ifdef CONFIG_KERNEL_FP32 +#include "relu6_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "relu6_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "relu6_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "relu6_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/relu6/relu6_common.h b/executor/operator/ref/kernel/relu6/relu6_common.h new file mode 100644 index 000000000..ceed752a7 --- /dev/null +++ b/executor/operator/ref/kernel/relu6/relu6_common.h @@ -0,0 +1,7 @@ +#ifndef __RELU6_COMMON_H__ +#define __RELU6_COMMON_H__ + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#endif diff --git a/executor/operator/ref/kernel/relu6/relu6_fp16.c b/executor/operator/ref/kernel/relu6/relu6_fp16.c new file mode 100644 index 000000000..cdb5014d8 --- /dev/null +++ b/executor/operator/ref/kernel/relu6/relu6_fp16.c @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu6_fp16(__fp16 * data, int size, float scale, int zero_point) +{ + +/* for arm32 && x86 */ +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + + for(int i = 0; i < size; i++) + { + data[i] = fp32_to_fp16(MIN(MAX(fp16_to_fp32(data[i]), 0.0),6.0)); + } + +#else + for(int i = 0; i < size; i++) + { + data[i] = MIN(MAX(data[i], 0.0f), 6.0f); + } +#endif + return 0; +} diff --git a/executor/operator/ref/kernel/relu6/relu6_fp32.c b/executor/operator/ref/kernel/relu6/relu6_fp32.c new file mode 100644 index 000000000..694915a00 --- /dev/null +++ b/executor/operator/ref/kernel/relu6/relu6_fp32.c @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + + +int relu6_fp32(float * data, int size, float scale, int zero_point) +{ + for(int i = 0; i < size; i++) + { + data[i] = MIN(MAX(data[i], 0), 6); + } + return 0; +} diff --git a/executor/operator/ref/kernel/relu6/relu6_int8.c b/executor/operator/ref/kernel/relu6/relu6_int8.c new file mode 100644 index 000000000..ffc142642 --- /dev/null +++ b/executor/operator/ref/kernel/relu6/relu6_int8.c @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu6_int8(int8_t * data, int size, float scale, int zero_point) +{ + for(int i = 0; i < size; i++) + { + float real_data = data[i] * scale; + real_data = MIN(MAX(real_data, 0.0), 6.0); + data[i] = round(real_data/scale); + } + return 0; +} diff --git a/executor/operator/ref/kernel/relu6/relu6_uint8.c b/executor/operator/ref/kernel/relu6/relu6_uint8.c new file mode 100644 index 000000000..a2b2f07f6 --- /dev/null +++ b/executor/operator/ref/kernel/relu6/relu6_uint8.c @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu6_uint8(uint8_t * data, int size, float scale, int zero_point) +{ + for(int i = 0; i < size; i++) + { + float real_data = (data[i]-zero_point) * scale; + real_data = MIN(MAX(real_data, 0.0), 6.0); + data[i] = round(real_data/scale) + zero_point; + } + return 0; +} diff --git a/executor/operator/ref/kernel/resize/resize_fp16.c b/executor/operator/ref/kernel/resize/resize_fp16.c new file mode 100644 index 000000000..19d428c5e --- /dev/null +++ b/executor/operator/ref/kernel/resize/resize_fp16.c @@ -0,0 +1,92 @@ + +static void bilinear_resize_fp16(__fp16* inp, __fp16* output, int h, int w, int c, float scale_x, float scale_y, int oh, int ow) +{ + int out_hw = oh * ow; + int in_hw = h * w; + for(int j = 0; j < oh; j++) + { + float fy = (j + 0.5) * scale_y - 0.5; + int sy = floor(fy); + fy -= sy; + sy = T_MIN(sy, h - 2); + sy = T_MAX(0, sy); + float fy_0 = 1.f - fy; + + for(int i = 0; i < ow; i++) + { + float fx = (i + 0.5) * scale_x - 0.5; + int sx = floor(fx); + fx -= sx; + if(sx < 0) + { + sx = 0; + fx = 0; + } + if(sx >= w - 1) + { + fx = 0; + sx = w - 2; + } + float fx_0 = 1.f - fx; + int out_idx = j * ow + i; + int in_idx = sy * w + sx; + // printf("i=%d j=%d\t sx=%d fx=%f\t sy=%d 
fy=%f\n",i,j,sx,fx,sy,fy); + for(int k = 0; k < c; k++) + { + int in_index = in_idx + k * in_hw; + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + output[k * out_hw + out_idx] = fp32_to_fp16(fp16_to_fp32(inp[in_index]) * fx_0 * fy_0 + fp16_to_fp32(inp[in_index + w]) * fx_0 * fy + + fp16_to_fp32(inp[in_index + 1]) * fx * fy_0 + fp16_to_fp32(inp[in_index + w + 1]) * fx * fy); + #else + output[k * out_hw + out_idx] = inp[in_index] * fx_0 * fy_0 + inp[in_index + w] * fx_0 * fy + + inp[in_index + 1] * fx * fy_0 + inp[in_index + w + 1] * fx * fy; + #endif + } + } + } +} + +static int resize_fp16(__fp16* input, __fp16* output, struct resize_param* param) +{ + + int batch = param->batch; + int channel = param->channel; + int in_chw = channel * param->input_h * param->input_w; + int out_chw = channel * param->output_h * param->output_w; + + + for(int n = 0; n < batch; n++) + { + + if(param->type==0) + { + int si, sj; + for(int k = 0; k < channel; k++) + { + __fp16* input_c = input + n*in_chw + k * param->input_h * param->input_w; + __fp16* output_c = output + k *param->output_h * param->output_w; + for(int i = 0; i < param->output_h; i++) + { + si = T_MIN(( int )(i * param->scale_y), param->input_h - 1); + for(int j = 0; j < param->output_w; j++) + { + sj = T_MIN(( int )(j * param->scale_x), param->output_w - 1); + output_c[i * param->output_w + j] = input_c[si * param->input_w + sj]; + } + } + } + input += in_chw; + output += out_chw; + + } + else + { + bilinear_resize_fp16(input + n*in_chw, output + n*out_chw, param->input_h, param->input_w, channel, + param->scale_x, param->scale_y, param->output_h, param->output_w); + input += in_chw; + output += out_chw; + } + } + + return 0; +} diff --git a/executor/operator/ref/kernel/resize/resize_fp32.c b/executor/operator/ref/kernel/resize/resize_fp32.c new file mode 100644 index 000000000..694fe39b2 --- /dev/null +++ b/executor/operator/ref/kernel/resize/resize_fp32.c @@ -0,0 +1,86 @@ + +static void bilinear_resize_fp32(float* inp, 
float* output, int h, int w, int c, float scale_x, float scale_y, int oh, int ow) +{ + int out_hw = oh * ow; + int in_hw = h * w; + for(int j = 0; j < oh; j++) + { + float fy = (j + 0.5) * scale_y - 0.5; + int sy = floor(fy); + fy -= sy; + sy = T_MIN(sy, h - 2); + sy = T_MAX(0, sy); + float fy_0 = 1.f - fy; + + for(int i = 0; i < ow; i++) + { + float fx = (i + 0.5) * scale_x - 0.5; + int sx = floor(fx); + fx -= sx; + if(sx < 0) + { + sx = 0; + fx = 0; + } + if(sx >= w - 1) + { + fx = 0; + sx = w - 2; + } + float fx_0 = 1.f - fx; + int out_idx = j * ow + i; + int in_idx = sy * w + sx; + // printf("i=%d j=%d\t sx=%d fx=%f\t sy=%d fy=%f\n",i,j,sx,fx,sy,fy); + for(int k = 0; k < c; k++) + { + int in_index = in_idx + k * in_hw; + output[k * out_hw + out_idx] = inp[in_index] * fx_0 * fy_0 + inp[in_index + w] * fx_0 * fy + + inp[in_index + 1] * fx * fy_0 + inp[in_index + w + 1] * fx * fy; + } + } + } +} + +static int resize_fp32(float* input, float* output, struct resize_param* param) +{ + int batch = param->batch; + int channel = param->channel; + int in_chw = channel * param->input_h * param->input_w; + int out_chw = channel * param->output_h * param->output_w; + + for(int n = 0; n < batch; n++) + { + + if(param->type==0) + { + int si, sj; + for(int k = 0; k < channel; k++) + { + float* input_c = input + n*in_chw + k * param->input_h * param->input_w; + float* output_c = output + n*out_chw + k * param->output_h * param->output_w; + for(int i = 0; i < param->output_h; i++) + { + si = T_MIN(( int )(i * param->scale_y), param->input_h - 1); + for(int j = 0; j < param->output_w; j++) + { + sj = T_MIN(( int )(j * param->scale_x), param->input_w - 1); + output_c[i * param->output_w + j] = input_c[si * param->input_w + sj]; + } + } + } + + input += in_chw; + output += out_chw; + + } + else + { + bilinear_resize_fp32(input+ n*in_chw, output + n*out_chw, param->input_h, param->input_w, channel, + param->scale_x, param->scale_y, param->output_h, param->output_w); + input += 
in_chw; + output += out_chw; + } + } + + return 0; +} diff --git a/executor/operator/ref/kernel/resize/resize_int8.c b/executor/operator/ref/kernel/resize/resize_int8.c new file mode 100644 index 000000000..9db782b5e --- /dev/null +++ b/executor/operator/ref/kernel/resize/resize_int8.c @@ -0,0 +1,55 @@ +#include + +static int prelu_fp32(int batch_number,int in_chw,int out_chw,float* input, float* output, int h, int w, int c, float scale_x, float scale_y, int oh, int ow) +{ + + for(int i = 0; i < batch_number; i++) + { + bilinear_resize(input, output, h, w, c, scale_x, scale_y, oh, ow); + input += in_chw; + output += out_chw; + } + + return 0; +} +void bilinear_resize(float* inp, float* output, int h, int w, int c, float scale_x, float scale_y, int oh, int ow) +{ + int out_hw = oh * ow; + int in_hw = h * w; + for(int j = 0; j < oh; j++) + { + float fy = (j + 0.5) * scale_y - 0.5; + int sy = floor(fy); + fy -= sy; + sy = min(sy, h - 2); + sy = max(0, sy); + float fy_0 = 1.f - fy; + + for(int i = 0; i < ow; i++) + { + float fx = (i + 0.5) * scale_x - 0.5; + int sx = floor(fx); + fx -= sx; + if(sx < 0) + { + sx = 0; + fx = 0; + } + if(sx >= w - 1) + { + fx = 0; + sx = w - 2; + } + float fx_0 = 1.f - fx; + int out_idx = j * ow + i; + int in_idx = sy * w + sx; + // printf("i=%d j=%d\t sx=%d fx=%f\t sy=%d fy=%f\n",i,j,sx,fx,sy,fy); + for(int k = 0; k < c; k++) + { + int in_index = in_idx + k * in_hw; + output[k * out_hw + out_idx] = inp[in_index] * fx_0 * fy_0 + inp[in_index + w] * fx_0 * fy + + inp[in_index + 1] * fx * fy_0 + inp[in_index + w + 1] * fx * fy; + } + } + } +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/resize/resize_kernel.h b/executor/operator/ref/kernel/resize/resize_kernel.h new file mode 100644 index 000000000..3e75f60a9 --- /dev/null +++ b/executor/operator/ref/kernel/resize/resize_kernel.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __RESIZE_KERNEL_H__ +#define __RESIZE_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct resize_param +{ + int type; + int batch; + int channel; + int input_h; + int input_w; + int output_h; + int output_w; + float scale_x; + float scale_y; +}; +#define T_MAX(a,b) ((a)>(b)?(a):(b)) +#define T_MIN(a,b) ((a)<(b)?(a):(b)) + +typedef int (*resize_t)(void* input, void* output, struct resize_param* param); + +#ifdef CONFIG_KERNEL_FP32 +#include "resize_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "resize_fp16.c" +#endif + +// #ifdef CONFIG_KERNEL_INT8 +// #include "flatten_int8.c" +// #endif + +// #ifdef CONFIG_KERNEL_UINT8 +// #include "flatten_uint8.c" +// #endif + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/resize/resize_uint8.c b/executor/operator/ref/kernel/resize/resize_uint8.c new file mode 100644 index 000000000..9db782b5e --- /dev/null +++ b/executor/operator/ref/kernel/resize/resize_uint8.c @@ -0,0 +1,55 @@ +#include + +static int prelu_fp32(int batch_number,int in_chw,int out_chw,float* input, float* output, int h, int w, int c, 
float scale_x, float scale_y, int oh, int ow) +{ + + for(int i = 0; i < batch_number; i++) + { + bilinear_resize(input, output, h, w, c, scale_x, scale_y, oh, ow); + input += in_chw; + output += out_chw; + } + + return 0; +} +void bilinear_resize(float* inp, float* output, int h, int w, int c, float scale_x, float scale_y, int oh, int ow) +{ + int out_hw = oh * ow; + int in_hw = h * w; + for(int j = 0; j < oh; j++) + { + float fy = (j + 0.5) * scale_y - 0.5; + int sy = floor(fy); + fy -= sy; + sy = min(sy, h - 2); + sy = max(0, sy); + float fy_0 = 1.f - fy; + + for(int i = 0; i < ow; i++) + { + float fx = (i + 0.5) * scale_x - 0.5; + int sx = floor(fx); + fx -= sx; + if(sx < 0) + { + sx = 0; + fx = 0; + } + if(sx >= w - 1) + { + fx = 0; + sx = w - 2; + } + float fx_0 = 1.f - fx; + int out_idx = j * ow + i; + int in_idx = sy * w + sx; + // printf("i=%d j=%d\t sx=%d fx=%f\t sy=%d fy=%f\n",i,j,sx,fx,sy,fy); + for(int k = 0; k < c; k++) + { + int in_index = in_idx + k * in_hw; + output[k * out_hw + out_idx] = inp[in_index] * fx_0 * fy_0 + inp[in_index + w] * fx_0 * fy + + inp[in_index + 1] * fx * fy_0 + inp[in_index + w + 1] * fx * fy; + } + } + } +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/rpn/ref_rpn_fp16.c b/executor/operator/ref/kernel/rpn/ref_rpn_fp16.c new file mode 100644 index 000000000..babe09e36 --- /dev/null +++ b/executor/operator/ref/kernel/rpn/ref_rpn_fp16.c @@ -0,0 +1,52 @@ +#include "ref_rpn_kernel.h" + +int ref_rpn_fp16(const __fp16* score, __fp16* featmap, float* anchors, __fp16* output, struct rpn_param* param) +{ + if(score == nullptr || featmap == nullptr || anchors == nullptr || output == nullptr) + return false; + int featmap_size = param->feat_height * param->feat_width * param->feat_chan; + int max_num_boxes = featmap_size /4; + struct RPN_Box* boxes = (struct RPN_Box*)malloc(max_num_boxes * sizeof(struct RPN_Box)); + + /* __fp16 -> float */ + float* featmap_fp32 = (float*)malloc(featmap_size * sizeof(float)); + 
float* score_fp32 = (float*)malloc(max_num_boxes * sizeof(float)); + for(int i = 0; i < featmap_size; i++) + featmap_fp32[i] = fp16_to_fp32(featmap[i]); + for(int i = 0; i < max_num_boxes; i++) + score_fp32[i] = fp16_to_fp32(score[i]); + + bbox_tranform_inv(featmap_fp32, anchors, param); + + int num_boxes = 0; + ref_filter_boxes(boxes, featmap_fp32, score_fp32, &num_boxes, param); + + sort_rpn_boxes_by_score(boxes, num_boxes); + + if(param->per_nms_topn > 0) + { + num_boxes = RPN_MIN(param->per_nms_topn, num_boxes); + } + nms_rpn_boxes(boxes, &num_boxes, param->nms_thresh); + + if(param->post_nms_topn > 0) + { + num_boxes = RPN_MIN(param->post_nms_topn, num_boxes); + } + // inder shape [default batch=1] + + // std::cout<<"num_box "<feat_height * param->feat_width * param->feat_chan; + int max_num_boxes = featmap_size /4; + struct RPN_Box* boxes = (struct RPN_Box*)malloc(max_num_boxes * sizeof(struct RPN_Box)); + + bbox_tranform_inv(featmap, anchors, param); + + int num_boxes = 0; + ref_filter_boxes(boxes, featmap, score, &num_boxes, param); + + sort_rpn_boxes_by_score(boxes, num_boxes); + + if(param->per_nms_topn > 0) + { + num_boxes = RPN_MIN(param->per_nms_topn, num_boxes); + } + nms_rpn_boxes(boxes, &num_boxes, param->nms_thresh); + + if(param->post_nms_topn > 0) + { + num_boxes = RPN_MIN(param->post_nms_topn, num_boxes); + } + // inder shape [default batch=1] + + // std::cout<<"num_box "<num_classes + 1; + const int num_boxes = param->num_boxes; + + /* transform uint8_t to fp32 */ + int input_size = num_boxes * 4; + int score_size = num_boxes * num_classes; + float* input_f = (float*)malloc( input_size * sizeof(float)); + float* score_f = (float*)malloc( score_size * sizeof(float)); + float* anchor_f = (float*)malloc( input_size * sizeof(float)); + for(int i =0; i < input_size; i++) + input_f[i] = (input[i] - param->zero[0]) * param->quant_scale[0]; + for(int i =0; i < score_size; i++) + score_f[i] = score[i] * param->quant_scale[1]; + for(int i =0; i < 
input_size; i++) + anchor_f[i] = (anchor[i] - param->zero[2]) * param->quant_scale[2]; + + ref_dpp_common(input_f, score_f, anchor_f, param, detect_num, detect_class, detect_score, detect_boxes); + + free(anchor_f); + free(score_f); + free(input_f); + + return 0; +} diff --git a/executor/operator/ref/kernel/rpn/ref_rpn_kernel.h b/executor/operator/ref/kernel/rpn/ref_rpn_kernel.h new file mode 100644 index 000000000..6372c1dbf --- /dev/null +++ b/executor/operator/ref/kernel/rpn/ref_rpn_kernel.h @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#ifndef __REF_RPN_KERNEL_H__ +#define __REF_RPN_KERNEL_H__ + +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct anchor_box +{ + float x0; // xmin + float y0; // ymin + float x1; // xmax + float y1; // ymax +}; +struct RPN_Box +{ + float x0; // xmin + float y0; // ymin + float x1; // xmax + float y1; // ymax + float score; +}; + +struct rpn_param +{ + int feat_height; + int feat_width; + int feat_chan; + int score_chan; + float src_scale; + int src_width; + int src_height; + int num_anchors; + int min_size; + int feat_stride; + int per_nms_topn; + int post_nms_topn; + float nms_thresh; + //float scales[4]; + //float quant_scale[3]; + //int zero[3]; +}; + +#define RPN_MIN(a,b) ( (a)<(b) ? (a) : (b) ) +#define RPN_MAX(a,b) ( (a)>(b) ? (a) : (b) ) + +typedef int (*ref_rpn_kernel_t )(const void* score, void* featmap, float* anchor, void* output, struct rpn_param* param); + +static inline void bbox_tranform_inv(float* m_box, float* local_anchors, struct rpn_param* param) +{ + int feat_size = param->feat_height * param->feat_width; + int c_4 = param->feat_chan / 4; + for(int i = 0; i < c_4; ++i) + { + for(int j = 0; j < (2*feat_size); ++j) + { + local_anchors[(i*4+2)*feat_size + j] -= local_anchors[(i*4+0)*feat_size + j] - 1; + local_anchors[(i*4+0)*feat_size + j] += local_anchors[(i*4+2)*feat_size + j] * 0.5; + + m_box[(i * 4 + 0) * feat_size + j] *= local_anchors[(i*4+2)*feat_size + j]; + m_box[(i * 4 + 0) * feat_size + j] += local_anchors[(i*4+0)*feat_size + j]; + + m_box[(i * 4 + 2) * feat_size + j] = exp(m_box[(i * 4 + 2) * feat_size + j]); + m_box[(i * 4 + 2) * feat_size + j] *= local_anchors[(i*4+2)*feat_size + j]; + } + } +} + +static inline void ref_filter_boxes(struct RPN_Box* boxes, const float* featmap, const float* score, int* num_boxes, struct rpn_param* param) +{ + float local_minsize = param->min_size * param->src_scale; + int 
c_4 = param->feat_chan / 4; + int feat_size = param->feat_height * param->feat_width; + + int offset_w, offset_h, offset_x, offset_y, offset_s; + + int num = 0; + for(int h = 0; h < param->feat_height; h++) + for(int w = 0; w < param->feat_width; w++) + { + offset_x = h * param->feat_width + w; + offset_y = offset_x + feat_size; + offset_w = offset_y + feat_size; + offset_h = offset_w + feat_size; + offset_s = feat_size * param->num_anchors + offset_x; + for(int c = 0; c < c_4; c++) + { + float width = featmap[offset_w]; + float height = featmap[offset_h]; + if((width >= local_minsize) & (height >= local_minsize)) + { + struct RPN_Box tmp; + tmp.x0 = featmap[offset_x] - 0.5 * width; + tmp.y0 = featmap[offset_y] - 0.5 * height; + tmp.x1 = featmap[offset_x] + 0.5 * width; + tmp.y1 = featmap[offset_y] + 0.5 * height; + tmp.x0 = RPN_MIN(RPN_MAX(tmp.x0, 0), param->src_width); + tmp.y0 = RPN_MIN(RPN_MAX(tmp.y0, 0), param->src_height); + tmp.x1 = RPN_MIN(RPN_MAX(tmp.x1, 0), param->src_width); + tmp.y1 = RPN_MIN(RPN_MAX(tmp.y1, 0), param->src_height); + tmp.score = score[offset_s]; + memcpy(boxes + num, &tmp, sizeof(struct RPN_Box)); + num ++; + } + offset_x += 4*feat_size; + offset_y += 4*feat_size; + offset_w += 4*feat_size; + offset_h += 4*feat_size; + offset_s += feat_size; + } + } + + *num_boxes = num; +} + +void sort_rpn_boxes_by_score(struct RPN_Box* boxes, int size) +{ + int i, j; + for(i = 0; i < size-1; i++) + { + int max_idx = i; + for(j = i + 1; j < size; j++) + { + if(boxes[max_idx].score < boxes[j].score) + max_idx = j; + } + if(i != max_idx) + { + struct RPN_Box tmp; + memcpy(&tmp, boxes+i, sizeof(struct RPN_Box)); + memcpy(boxes + i, boxes+max_idx, sizeof(struct RPN_Box)); + memcpy(boxes + max_idx, &tmp, sizeof(struct RPN_Box)); + } + } +} + +void nms_rpn_boxes(struct RPN_Box* input_boxes, int* size, float nms_thresh) +{ + int input_size = *size; + int output_size = 0; + + struct RPN_Box* output_boxes = (struct RPN_Box*)malloc(sizeof(struct 
RPN_Box)*input_size); + float* areas = (float*)malloc(sizeof(float)* input_size); + int* picked = (int*)malloc(sizeof(int)* input_size); + + for(int i = 0; i < input_size; ++i) + { + areas[i] = (input_boxes[i].x1 - input_boxes[i].x0 + 1) * (input_boxes[i].y1 - input_boxes[i].y0 + 1); + } + for(int i = 0; i < input_size; ++i) + { + int keep = 1; + for(int j = 0; j < output_size;j++) + { + float xx1 = RPN_MAX(input_boxes[i].x0, output_boxes[j].x0); + float yy1 = RPN_MAX(input_boxes[i].y0, output_boxes[j].y0); + float xx2 = RPN_MIN(input_boxes[i].x1, output_boxes[j].x1); + float yy2 = RPN_MIN(input_boxes[i].y1, output_boxes[j].y1); + float w = RPN_MAX(float(0), xx2 - xx1 + 1); + float h = RPN_MAX(float(0), yy2 - yy1 + 1); + float inter = w * h; + float ovr = inter / (areas[i] + areas[picked[j]] - inter); + if(ovr >= nms_thresh) + { + keep = 0; + break; + } + } + if(keep) + { + memcpy(output_boxes + output_size, input_boxes + i, sizeof(struct RPN_Box)); + picked[output_size] = i; + output_size ++; + } + + } + memcpy(input_boxes, output_boxes, output_size * sizeof(struct RPN_Box)); + *size = output_size; + free(picked); + free(areas); + free(output_boxes); +} + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_rpn_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_rpn_fp16.c" +#endif +/* + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_rpn_uint8.c" +#endif +*/ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/sigmoid/Makefile b/executor/operator/ref/kernel/sigmoid/Makefile new file mode 100644 index 000000000..78d922637 --- /dev/null +++ b/executor/operator/ref/kernel/sigmoid/Makefile @@ -0,0 +1,4 @@ +# obj-$(CONFIG_KERNEL_FP32)+=sigmoid_fp32.o +# obj-$(CONFIG_KERNEL_FP16)+=sigmoid_fp16.o +# obj-$(CONFIG_KERNEL_INT8)+=sigmoid_int8.o +# obj-$(CONFIG_KERNEL_UINT8)+=sigmoid_uint8.o diff --git a/executor/operator/ref/kernel/sigmoid/sigmoid.h b/executor/operator/ref/kernel/sigmoid/sigmoid.h new file mode 100644 index 000000000..42aaa8606 --- 
/dev/null +++ b/executor/operator/ref/kernel/sigmoid/sigmoid.h @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __SIGMOID_H__ +#define __SIGMOID_H__ + +#include + +#include "compiler_fp16.h" +#include +#ifdef __cplusplus +extern "C" { +#endif + +struct sigmoid_param; + +struct sigmoid_param +{ + float scale[2]; + int zero[2]; +}; + + + +#define T_MAX(a, b) ((a) > (b) ? (a) : (b)) +#define T_MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +typedef int (*sigmoid_t)(void * data, int size,const sigmoid_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "sigmoid_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "sigmoid_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "sigmoid_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "sigmoid_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/sigmoid/sigmoid_fp16.c b/executor/operator/ref/kernel/sigmoid/sigmoid_fp16.c new file mode 100644 index 000000000..45f19d6c5 --- /dev/null +++ b/executor/operator/ref/kernel/sigmoid/sigmoid_fp16.c @@ -0,0 +1,22 @@ + + +int sigmoid_fp16(__fp16 * data,int size,const sigmoid_param * param) +{ + for(int i=0;iscale[0]; + float real_comp = T_MIN(real_in, 30); + real_comp = T_MAX(real_in, -30); + + real_comp = 1 / (1 + exp(-real_comp)); + data[i] = round(real_comp*127); + + } + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/sigmoid/sigmoid_uint8.c b/executor/operator/ref/kernel/sigmoid/sigmoid_uint8.c new file mode 100644 index 000000000..06ec6bbc1 --- /dev/null +++ b/executor/operator/ref/kernel/sigmoid/sigmoid_uint8.c @@ -0,0 +1,15 @@ + + +int sigmoid_uint8(uint8_t * data,int size,const sigmoid_param * param) +{ + for(int i=0;izero[0])*param->scale[0]; + float real_comp = T_MIN(real_in, 30); + real_comp = T_MAX(real_in, -30); + + real_comp = 1 / (1 + exp(-real_comp)); + data[i] = round(real_comp/param->scale[1]) + param->zero[1]; + } + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/slice/slice_common.c b/executor/operator/ref/kernel/slice/slice_common.c new file mode 100644 index 000000000..7621e2a27 --- /dev/null +++ b/executor/operator/ref/kernel/slice/slice_common.c @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +static int caffe_run(const int8_t* in_data,int8_t** out_data,int element_size,const struct slice_param *param) +{ + // get the slice param + int slice_axis = param->axis; + int num_slices = 1; + int slice_size = 1; + const int8_t * input = in_data; + const int *in_dim = param->in_shape; + + for(int i = 0; i < slice_axis; i++) + { + num_slices = num_slices * in_dim[i]; + } + for(int i = slice_axis + 1; i < param->dim_num; i++) + { + slice_size = slice_size * in_dim[i]; + } + int in_slice = in_dim[slice_axis]; + int slice_index = 0; + int out_num = param->out_num; + for(int i = 0; i < out_num; i++) + { + int8_t* output = out_data[i]; + int out_slice = param->output_shape[i].dims[slice_axis]; + for(int n = 0; n < num_slices; n++) + { + int in_offset = (n * in_slice + slice_index) * slice_size * element_size; + int out_offset = n * out_slice * slice_size * element_size; + memcpy(output+out_offset,input+in_offset,slice_size * out_slice * element_size); + } + slice_index += out_slice; + } + return 0; + +} +static int tf_run(const int8_t* in_data,int8_t** out_data,int element_size,const struct slice_param *param) +{ + const int8_t* input = in_data; + int8_t* output = out_data[0]; + + const int 
*begins = param->output_shape[0].begins; + const int *sizes = param->output_shape[0].sizes; + int real_dim = param->dim_num; + const int* in_dim_new = param->in_shape; + int in_dim_0 = in_dim_new[0]; + int in_dim_1 = in_dim_new[1]; + int in_dim_2 = in_dim_new[2]; + int in_dim_3 = in_dim_new[3]; + + int start_dim_0 = (4 - real_dim) > 0 ? 0 : begins[0]; + int stop_dim_0 = ((4 - real_dim) > 0 || sizes[0] == -1) + ? in_dim_0 - start_dim_0 + : start_dim_0 + sizes[0]; + int start_dim_1 = (3 - real_dim) > 0 ? 0 : begins[1]; + int stop_dim_1 = ((3 - real_dim) > 0 || sizes[1] == -1) + ? in_dim_1 - start_dim_1 + : start_dim_1 + sizes[1]; + int start_dim_2 = (2 - real_dim) > 0 ? 0 : begins[2]; + int stop_dim_2 = ((2 - real_dim) > 0 || sizes[2] == -1) + ? in_dim_2 - start_dim_2 + : start_dim_2 + sizes[2]; + int start_dim_3 = (1 - real_dim) > 0 ? 0 : begins[3]; + int stop_dim_3 = ((1 - real_dim) > 0 || sizes[3] == -1) + ? in_dim_3 - start_dim_3 + : start_dim_3 + sizes[3]; + + for(int n = start_dim_0; n < stop_dim_0;++n) + { + for(int i = start_dim_1; i < stop_dim_1; ++i) + { + for(int j = start_dim_2; j < stop_dim_2; ++j) + { + int len = stop_dim_3 - start_dim_3; + int input_off = n * in_dim_1 * in_dim_2 * in_dim_3 + + i * in_dim_2 * in_dim_3 + + j * in_dim_3 + start_dim_3; + memcpy(output,input + input_off,len * element_size); + output += len * element_size; + } + } + } + return 0; +} +static int ref_slice_common(const int8_t* in_data,int8_t** out_data,int element_size,const struct slice_param *param) +{ + if(param->iscaffe) + return caffe_run(in_data,out_data,element_size,param); + else + return tf_run(in_data,out_data,element_size,param); + +} diff --git a/executor/operator/ref/kernel/slice/slice_fp16.c b/executor/operator/ref/kernel/slice/slice_fp16.c new file mode 100644 index 000000000..7aff3ed2c --- /dev/null +++ b/executor/operator/ref/kernel/slice/slice_fp16.c @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +static int ref_slice_fp16(const __fp16* in_data, __fp16** out_data, const struct slice_param* param) +{ + return ref_slice_common((const int8_t*)in_data,(int8_t**)out_data,sizeof(__fp16),param); +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/slice/slice_fp32.c b/executor/operator/ref/kernel/slice/slice_fp32.c new file mode 100644 index 000000000..d35989590 --- /dev/null +++ b/executor/operator/ref/kernel/slice/slice_fp32.c @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + + +static int ref_slice_fp32(const float* in_data, float** out_data, const struct slice_param* param) +{ + return ref_slice_common((const int8_t*)in_data,(int8_t**)out_data,sizeof(float),param); +} diff --git a/executor/operator/ref/kernel/slice/slice_int8.c b/executor/operator/ref/kernel/slice/slice_int8.c new file mode 100644 index 000000000..50a09cd60 --- /dev/null +++ b/executor/operator/ref/kernel/slice/slice_int8.c @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +static int ref_slice_int8(const int8_t* in_data, int8_t** out_data, const struct slice_param* param) +{ + return ref_slice_common((const int8_t*)in_data,(int8_t**)out_data,sizeof(int8_t),param); +} diff --git a/executor/operator/ref/kernel/slice/slice_kernel.h b/executor/operator/ref/kernel/slice/slice_kernel.h new file mode 100644 index 000000000..f219d3282 --- /dev/null +++ b/executor/operator/ref/kernel/slice/slice_kernel.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +#ifndef __SLICE_KERNEL_H__ +#define __SLICE_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct shape_dim +{ + int dims[4]; // for caffe + int begins[4]; // for tf + int sizes[4]; // for tf +}; + +struct slice_param +{ + int in_shape[4]; // the dim of the input + struct shape_dim *output_shape; // out shape + int out_num; + int dim_num; + int axis; // for caffe + float out_scale; // for input tensor int8 + bool iscaffe; +}; + +typedef int (*slice_t)(const int8_t* in_data, int8_t** out_data, const struct slice_param* param); + +#include "slice_common.c" +#ifdef CONFIG_KERNEL_FP32 +#include "slice_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "slice_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "slice_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "slice_uint8.c" +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/slice/slice_uint8.c b/executor/operator/ref/kernel/slice/slice_uint8.c new file mode 100644 index 000000000..c6fffce78 --- /dev/null +++ b/executor/operator/ref/kernel/slice/slice_uint8.c @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +static int ref_slice_uint8(const uint8_t* in_data, uint8_t** out_data, const struct slice_param* param) +{ + return ref_slice_common((const int8_t*)in_data,(int8_t**)out_data,sizeof(uint8_t),param); +} + diff --git a/executor/operator/ref/kernel/softmax/ref_softmax.h b/executor/operator/ref/kernel/softmax/ref_softmax.h new file mode 100644 index 000000000..a46b69f9b --- /dev/null +++ b/executor/operator/ref/kernel/softmax/ref_softmax.h @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#ifndef __REF_SOFTMAX_OP_KERNEL_H__ +#define __REF_SOFTMAX_OP_KERNEL_H__ + +#include +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct op_data +{ + int out_size; + int in_size; + int on_size; + int i_zero; + float i_scale; + int o_zero; + float o_scale; +}; + + +static void GetMaxArray(float* input, float* array, int in_size, int on_size) +{ + float* input_ptr = ( float* )input; + float* array_ptr = ( float* )array; + memset(array, 0, in_size * sizeof(float)); + + for(int j = 0; j < on_size; j++) + for(int l = 0; l < in_size; l++) + { + if(array_ptr[l] < input_ptr[j * in_size + l]) + array_ptr[l] = input_ptr[j * in_size + l]; + } +} + + +static void GetOutResult(float* input, float* output, float* array, float* sum_array, int in_size, int on_size) +{ + float* input_ptr = ( float* )input; + float* output_ptr = ( float* )output; + float* array_ptr = ( float* )array; + float* sum_array_ptr = ( float* )sum_array; + + memset(sum_array, 0x0, in_size * sizeof(float)); + + /* get the exp and the summary */ + + for(int j = 0; j < on_size; j++) + for(int l = 0; l < in_size; l++) + { + int index = j * in_size + l; + output_ptr[index] = exp(input_ptr[index] - array_ptr[l]); + sum_array_ptr[l] += output_ptr[index]; + } + + /* the final result */ + for(int j = 0; j < on_size; j++) + for(int l = 0; l < in_size; l++) + { + int index = j * in_size + l; + output_ptr[index] /= sum_array_ptr[l]; + } +} + + + +typedef int (*ref_softmax_kernel_t)(void * input, void * output, void * max_array, void * sum_array, op_data* op_param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_softmax_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_softmax_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_softmax_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_softmax_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif 
diff --git a/executor/operator/ref/kernel/softmax/ref_softmax_fp16.c b/executor/operator/ref/kernel/softmax/ref_softmax_fp16.c new file mode 100644 index 000000000..24d493c99 --- /dev/null +++ b/executor/operator/ref/kernel/softmax/ref_softmax_fp16.c @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#include + +#include "ref_softmax.h" +#include +#include + + + int ref_softmax_kernel_fp16(__fp16* input, __fp16* output, float* max_array, float* sum_array, op_data* op_param) + { + int out_size = op_param->out_size; + int in_size = op_param->in_size; + int on_size = op_param->on_size; + int on_in_size = in_size * on_size; + + float* input_f = (float*)malloc(out_size * on_in_size * sizeof(float)); + float* output_f = (float*)malloc(out_size * on_in_size * sizeof(float)); + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + input_f[i*on_in_size+j] = fp16_to_fp32(input[i*on_in_size+j]); + + for(int i = 0; i < out_size; i++) + { + /* get max */ + int img_base = i * in_size * on_size; + GetMaxArray(input_f + img_base, max_array, in_size, on_size); + GetOutResult(input_f + img_base, output_f + img_base, max_array, sum_array, in_size, on_size); + } + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + output[i*on_in_size+j] = fp32_to_fp16(output_f[i*on_in_size+j]); + + free(input_f); + free(output_f); + + return 0; + } + + + + diff --git a/executor/operator/ref/kernel/softmax/ref_softmax_fp32.c b/executor/operator/ref/kernel/softmax/ref_softmax_fp32.c new file mode 100644 index 000000000..c7424d1bc --- /dev/null +++ b/executor/operator/ref/kernel/softmax/ref_softmax_fp32.c @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#include +#include + +#include "ref_softmax.h" +#include + + + int ref_softmax_kernel_fp32(float* input, float* output, float* max_array, float* sum_array, op_data* op_param) + { + for(int i = 0; i < op_param->out_size; i++) + { + /* get max */ + int img_base = i * op_param->in_size * op_param->on_size; + GetMaxArray(input + img_base, max_array, op_param->in_size, op_param->on_size); + GetOutResult(input + img_base, output + img_base, max_array, sum_array, op_param->in_size, op_param->on_size); + } + + return 0; + } + + + + diff --git a/executor/operator/ref/kernel/softmax/ref_softmax_int8.c b/executor/operator/ref/kernel/softmax/ref_softmax_int8.c new file mode 100644 index 000000000..3ca40f0f6 --- /dev/null +++ b/executor/operator/ref/kernel/softmax/ref_softmax_int8.c @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#include + +#include "ref_softmax.h" +#include +#include + + + int ref_softmax_kernel_int8(int8_t* input, int8_t* output, float* max_array, float* sum_array, op_data* op_param) + { + int out_size = op_param->out_size; + int in_size = op_param->in_size; + int on_size = op_param->on_size; + int on_in_size = in_size * on_size; + + float* input_f = (float*)malloc(out_size * on_in_size * sizeof(float)); + float* output_f = (float*)malloc(out_size * on_in_size * sizeof(float)); + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + input_f[i*on_in_size+j] = (input[i*on_in_size+j])*op_param->i_scale; + + for(int i = 0; i < out_size; i++) + { + /* get max */ + int img_base = i * in_size * on_size; + GetMaxArray(input_f + img_base, max_array, in_size, on_size); + GetOutResult(input_f + img_base, output_f + img_base, max_array, sum_array, in_size, on_size); + } + + float fmax=0.0f; + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + if(fmaxo_scale = o_scale; + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + output[i*on_in_size+j] = round(output_f[i*on_in_size+j]/op_param->o_scale); + + free(input_f); + free(output_f); + + return 0; + } + + diff --git a/executor/operator/ref/kernel/softmax/ref_softmax_uint8.c b/executor/operator/ref/kernel/softmax/ref_softmax_uint8.c new file mode 100644 index 000000000..8fdc5c080 --- /dev/null +++ b/executor/operator/ref/kernel/softmax/ref_softmax_uint8.c @@ -0,0 +1,63 @@ 
+/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#include + +#include "ref_softmax.h" +#include +#include + + + int ref_softmax_kernel_uint8(uint8_t* input, uint8_t* output, float* max_array, float* sum_array, op_data* op_param) + { + int out_size = op_param->out_size; + int in_size = op_param->in_size; + int on_size = op_param->on_size; + int on_in_size = in_size * on_size; + + float* input_f = (float*)malloc(out_size * on_in_size * sizeof(float)); + float* output_f = (float*)malloc(out_size * on_in_size * sizeof(float)); + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + input_f[i*on_in_size+j] = (input[i*on_in_size+j]-op_param->i_zero)*op_param->i_scale; + + for(int i = 0; i < out_size; i++) + { + /* get max */ + int img_base = i * in_size * on_size; + GetMaxArray(input_f + img_base, max_array, in_size, on_size); + GetOutResult(input_f + img_base, output_f + img_base, max_array, sum_array, in_size, on_size); + } + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + output[i*on_in_size+j] = round((output_f[i*on_in_size+j]/op_param->o_scale)+op_param->o_zero); + + free(input_f); + 
free(output_f); + + return 0; + } + diff --git a/executor/operator/ref/kernel/split/split_fp16.c b/executor/operator/ref/kernel/split/split_fp16.c new file mode 100644 index 000000000..e7a38bb23 --- /dev/null +++ b/executor/operator/ref/kernel/split/split_fp16.c @@ -0,0 +1,32 @@ +static int split_fp16(const __fp16* in_data, __fp16** out_data, struct split_param* param) +{ + int slice_axis = param->axis; + int num_slices = 1; + int slice_size = 1; + for(int i = 0; i < slice_axis; i++) + { + num_slices = num_slices * param->input_shape.dim[i]; + } + for(int i = slice_axis + 1; i < param->input_dim; i++) + { + slice_size = slice_size * param->input_shape.dim[i]; + } + int in_slice = param->input_shape.dim[slice_axis]; + int slice_index = 0; + unsigned int out_num = param->output_counts; + for(unsigned int i = 0; i < out_num; i++) + { + __fp16 * output=(__fp16*)out_data[i]; + int out_slice = param->output_shape[i].dim[slice_axis]; + + for(int n = 0; n < num_slices; n++) + { + int in_offset = (n * in_slice + slice_index) * slice_size; + int out_offset = n * out_slice * slice_size; + memcpy(output+out_offset,in_data + in_offset,slice_size * out_slice * sizeof(__fp16)); + } + slice_index += out_slice; + } + + return 0; +} diff --git a/executor/operator/ref/kernel/split/split_fp32.c b/executor/operator/ref/kernel/split/split_fp32.c new file mode 100644 index 000000000..23496a5b2 --- /dev/null +++ b/executor/operator/ref/kernel/split/split_fp32.c @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + + +static int split_fp32(const float* in_data, float** out_data, struct split_param* param) +{ + int slice_axis = param->axis; + int num_slices = 1; + int slice_size = 1; + for(int i = 0; i < slice_axis; i++) + { + num_slices = num_slices * param->input_shape.dim[i]; + } + for(int i = slice_axis + 1; i < param->input_dim; i++) + { + slice_size = slice_size * param->input_shape.dim[i]; + } + int in_slice = param->input_shape.dim[slice_axis]; + int slice_index = 0; + unsigned int out_num = param->output_counts; + for(unsigned int i = 0; i < out_num; i++) + { + float * output=(float*)out_data[i]; + int out_slice = param->output_shape[i].dim[slice_axis]; + + for(int n = 0; n < num_slices; n++) + { + int in_offset = (n * in_slice + slice_index) * slice_size; + int out_offset = n * out_slice * slice_size; + memcpy(output+out_offset,in_data + in_offset,slice_size * out_slice * sizeof(float)); + } + slice_index += out_slice; + } + + return 0; +} diff --git a/executor/operator/ref/kernel/split/split_int8.c b/executor/operator/ref/kernel/split/split_int8.c new file mode 100644 index 000000000..8a915d641 --- /dev/null +++ b/executor/operator/ref/kernel/split/split_int8.c @@ -0,0 +1,32 @@ +static int split_int8(const int8_t* in_data, int8_t** out_data, struct split_param* param) +{ + int slice_axis = param->axis; + int num_slices = 1; + int slice_size = 1; + for(int i = 0; i < slice_axis; i++) + { + num_slices = num_slices * param->input_shape.dim[i]; + } + for(int i = 
slice_axis + 1; i < param->input_dim; i++) + { + slice_size = slice_size * param->input_shape.dim[i]; + } + int in_slice = param->input_shape.dim[slice_axis]; + int slice_index = 0; + unsigned int out_num = param->output_counts; + for(unsigned int i = 0; i < out_num; i++) + { + int8_t * output=(int8_t*)out_data[i]; + int out_slice = param->output_shape[i].dim[slice_axis]; + + for(int n = 0; n < num_slices; n++) + { + int in_offset = (n * in_slice + slice_index) * slice_size; + int out_offset = n * out_slice * slice_size; + memcpy(output+out_offset,in_data + in_offset,slice_size * out_slice * sizeof(int8_t)); + } + slice_index += out_slice; + } + + return 0; +} diff --git a/executor/operator/ref/kernel/split/split_kernel.h b/executor/operator/ref/kernel/split/split_kernel.h new file mode 100644 index 000000000..13bbddbed --- /dev/null +++ b/executor/operator/ref/kernel/split/split_kernel.h @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __SPLIT_KERNEL_H__ +#define __SPLIT_KERNEL_H__ + +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct shape_dim +{ + int dim[4]; + float scale; + int zero; +}; + +struct split_param +{ + struct shape_dim input_shape; + int output_counts; + int input_dim; + struct shape_dim* output_shape; + int output_dim; + int axis; + float out_scale; +}; + + +typedef int (*split_t)(void * data,void ** out_data,split_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "split_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "split_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "split_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "split_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/split/split_uint8.c b/executor/operator/ref/kernel/split/split_uint8.c new file mode 100644 index 000000000..b07bf24e6 --- /dev/null +++ b/executor/operator/ref/kernel/split/split_uint8.c @@ -0,0 +1,32 @@ +static int split_uint8(const uint8_t* in_data, uint8_t** out_data,struct split_param* param) +{ + int slice_axis = param->axis; + int num_slices = 1; + int slice_size = 1; + for(int i = 0; i < slice_axis; i++) + { + num_slices = num_slices * param->input_shape.dim[i]; + } + for(int i = slice_axis + 1; i < param->input_dim; i++) + { + slice_size = slice_size * param->input_shape.dim[i]; + } + int in_slice = param->input_shape.dim[slice_axis]; + int slice_index = 0; + unsigned int out_num = param->output_counts; + for(unsigned int i = 0; i < out_num; i++) + { + uint8_t * output=(uint8_t*)out_data[i]; + int out_slice = param->output_shape[i].dim[slice_axis]; + + for(int n = 0; n < num_slices; n++) + { + int in_offset = (n * in_slice + slice_index) * slice_size; + int out_offset = n * out_slice * slice_size; + memcpy(output+out_offset,in_data + 
in_offset,slice_size * out_slice * sizeof(uint8_t)); + } + slice_index += out_slice; + } + + return 0; +} diff --git a/executor/operator/ref/kernel/swap_axis/ref_swap_axis_fp32.c b/executor/operator/ref/kernel/swap_axis/ref_swap_axis_fp32.c new file mode 100644 index 000000000..980892587 --- /dev/null +++ b/executor/operator/ref/kernel/swap_axis/ref_swap_axis_fp32.c @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +static int ref_swap_axis_fp32(const float* in_data,float* out_data,const int* dims) +{ + + for(int i = 0; i < dims[0]; i ++) + for(int j = 0; j < dims[3]; j ++) + for(int p = 0; p < dims[2]; p ++) + for(int q = 0; q < dims[1]; q ++) + { + int out_index = i*dims[1]*dims[2]*dims[3]*dims[4] + j*dims[2]*dims[1]*dims[4] + + p*dims[1]*dims[4] + q*dims[4]; + int in_index = i*dims[1]*dims[2]*dims[3]*dims[4] + q*dims[2]*dims[3]*dims[4] + + p*dims[3]*dims[4] + j*dims[4]; + memcpy(out_data + out_index, in_data + in_index, dims[4]*sizeof(float)); + } + + return 0; +} diff --git a/executor/operator/ref/kernel/swap_axis/ref_swap_axis_kernel.h b/executor/operator/ref/kernel/swap_axis/ref_swap_axis_kernel.h new file mode 100644 index 000000000..1b124bbc8 --- /dev/null +++ b/executor/operator/ref/kernel/swap_axis/ref_swap_axis_kernel.h @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + + +#ifndef __REF_SWAP_AXIS_H__ +#define __REF_SWAP_AXIS_H__ + +#include +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus + extern "C" { +#endif + + +typedef int (*ref_swap_axis_kernel_t)(void * input, void * output, int* dims); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_swap_axis_fp32.c" +#endif +/* +#ifdef CONFIG_KERNEL_FP16 +#include "ref_swap_axis_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_swap_axis_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_swap_axis_uint8.c" +#endif +*/ + + + +#ifdef __cplusplus +} +#endif + + +#endif + + diff --git a/executor/operator/ref/kernel/tanh/tanh.h b/executor/operator/ref/kernel/tanh/tanh.h new file mode 100644 index 000000000..8d13a87e7 --- /dev/null +++ b/executor/operator/ref/kernel/tanh/tanh.h @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __TANH_KERNEL_H__ +#define __TANH_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" +#include "tanh_common.h" +#ifdef __cplusplus +extern "C" { +#endif +struct tanh_param{ + float input_scale; + int input_zero; + float output_scale; + int output_zero; +}; +typedef int (*tanh_t)(void * data, int size, struct tanh_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "tanh_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "tanh_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "tanh_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "tanh_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/tanh/tanh_common.h b/executor/operator/ref/kernel/tanh/tanh_common.h new file mode 100644 index 000000000..25026b4bb --- /dev/null +++ b/executor/operator/ref/kernel/tanh/tanh_common.h @@ -0,0 +1,7 @@ +#ifndef __TANH_COMMON_H__ +#define __TANH_COMMON_H__ + +#define T_MAX(a, b) ((a) > (b) ? (a) : (b)) +#define T_MIN(a, b) ((a) < (b) ? (a) : (b)) + +#endif diff --git a/executor/operator/ref/kernel/tanh/tanh_fp16.c b/executor/operator/ref/kernel/tanh/tanh_fp16.c new file mode 100644 index 000000000..f5c44e8b9 --- /dev/null +++ b/executor/operator/ref/kernel/tanh/tanh_fp16.c @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + + +int tanh_fp16(__fp16 * data,int size, struct tanh_param* param) +{ + for(int i=0;iinput_scale; + real_data = T_MIN(real_data, 30.0f); + real_data = T_MAX(real_data, -30.0f); + + real_data = (exp(real_data) - exp(-real_data)) / (exp(real_data) + exp(-real_data)); + data[i] = real_data * 127; + } + return 0; +} diff --git a/executor/operator/ref/kernel/tanh/tanh_uint8.c b/executor/operator/ref/kernel/tanh/tanh_uint8.c new file mode 100644 index 000000000..d0870744d --- /dev/null +++ b/executor/operator/ref/kernel/tanh/tanh_uint8.c @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int tanh_uint8(int8_t * data, int size, struct tanh_param* param) +{ + for(int i=0;iinput_zero) * param->input_scale; + real_data = T_MIN(real_data, 30.0f); + real_data = T_MAX(real_data, -30.0f); + + real_data = (exp(real_data) - exp(-real_data)) / (exp(real_data) + exp(-real_data)); + data[i] = round(real_data /param->output_scale) + param->output_zero; + } + return 0; +} diff --git a/executor/operator/ref/pad.cpp b/executor/operator/ref/pad.cpp new file mode 100644 index 000000000..3e9447735 --- /dev/null +++ b/executor/operator/ref/pad.cpp @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +#include "operator/pad.hpp" +#include "kernel/pad/pad_kernel.h" + +namespace TEngine { + +namespace RefPadOps { + + + +struct RefPad : public MTNodeOps +{ + bool Prerun(Node * node) override; + + bool Run(Node * node) override; + void InitRegistry(void); + + pad_t kernel_run; + pad_param param; + + KernelRegistry kernel_registry; + RefPad(void) + { + InitRegistry(); + } +}; + + +bool RefPad::Prerun(Node * node) +{ + Tensor * input_tensor=node->GetInputTensor(0); + + int layout=exec_attr->graph_layout; + + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + + return true; +} +static int get_scale_zero(Tensor* itensor,Tensor * otensor,pad_param* param) +{ + auto* i_quant = itensor->GetQuantParam(); + auto* o_quant = otensor->GetQuantParam(); + if( i_quant->size() != 1 ) + return -1; + param->scale[0] = (*i_quant)[0].scale; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + if( o_quant->size() != 1) + return -1; + + param->scale[1] = (*o_quant)[0].scale; + param->zero[1] = (*o_quant)[0].zero_point; + + param->zero[0] = (*i_quant)[0].zero_point; + } + return 0; +} + +bool RefPad::Run(Node * node) +{ + Pad* pad_op = dynamic_cast(node->GetOp()); + PadParam* op_param = pad_op->GetParam(); + + Tensor * input_tensor = node->GetInputTensor(0); + Tensor * out_tensor = node->GetOutputTensor(0); + // int element_size = DataType::GetTypeSize(out_tensor->GetDataType()); + // int out_size= out_tensor->GetTotalSize() / element_size; + + if(input_tensor->GetDataType() == TENGINE_DT_INT8 ||input_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero(input_tensor,out_tensor, ¶m) < 0) + return false; + } + 
const TShape& i_shape = input_tensor->GetShape(); + std::vector i_dims=i_shape.GetDim(); + + const TShape& o_shape = out_tensor->GetShape(); + std::vector o_dims=o_shape.GetDim(); + + int in_n=i_shape.GetN(); + int in_h=i_shape.GetH(); + int in_w=i_shape.GetW(); + int in_c=i_shape.GetC(); + + int out_n=o_shape.GetN(); + int out_h=o_shape.GetH(); + int out_w=o_shape.GetW(); + int out_c=o_shape.GetC(); + + int in_size=in_n*in_h*in_w*in_c; + int out_size=out_n*out_h*out_w*out_c; + + param.mode=op_param->mode; + if(param.mode==0) + { + param.cv_f32=op_param->value; + param.cv_f16=(__fp16 )fp32_to_fp16(op_param->value); + param.cv_int8=op_param->value; + param.cv_uint8=op_param->value; + } + param.pad_0_h=op_param->pad_0_h; + param.pad_0_w=op_param->pad_0_w; + param.pad_1_h=op_param->pad_1_h; + param.pad_1_w=op_param->pad_1_w; + param.pad_2_h=op_param->pad_2_h; + param.pad_2_w=op_param->pad_2_w; + param.pad_3_h=op_param->pad_3_h; + param.pad_3_w=op_param->pad_3_w; + + param.in_n=in_n; + param.in_h=in_h; + param.in_w=in_w; + param.in_c=in_c; + + param.out_n=out_n; + param.out_h=out_h; + param.out_w=out_w; + + param.in_size=in_size; + param.out_size=out_size; + + void* in_data=get_tensor_mem(input_tensor); + void* out_data=get_tensor_mem(out_tensor); + + int ret=kernel_run(in_data,out_data,¶m); + if(input_tensor->GetDataType() == TENGINE_DT_INT8) + { + auto* i_quant = input_tensor->GetQuantParam(); + auto* o_quant = out_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale =(*i_quant)[0].scale;; + o_quant->resize(0); + o_quant->push_back(q_param); + } + if(ret<0) + return false; + else + return true; +} + +void RefPad::InitRegistry(void) +{ + +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((pad_t)pad_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((pad_t)pad_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((pad_t)pad_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + 
kernel_registry.Register((pad_t)pad_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((pad_t)pad_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((pad_t)pad_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((pad_t)pad_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((pad_t)pad_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefPad* ops = new RefPad(); + + LOG_DEBUG()<<"Pad RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefSqueezeOps +void RegisterPadOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Pad", RefPadOps::SelectFunc, 1000); +} +} // namespace TEngine diff --git a/executor/operator/ref/prelu.cpp b/executor/operator/ref/prelu.cpp new file mode 100644 index 000000000..41f264d63 --- /dev/null +++ b/executor/operator/ref/prelu.cpp @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "kernel/prelu/prelu_kernel.h" + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/prelu.hpp" + +namespace TEngine { + +namespace RefPreluOps { + + + +struct PReluOps : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + bool Postrun(Node * node) override; + void InitRegistry(void); + prelu_param op_param; + prelu_t kernel_run; + + KernelRegistry kernel_registry; + + PReluOps(void) + { + kernel_run = nullptr; + + InitRegistry(); + } +}; + +bool PReluOps::Prerun(Node * node) +{ + Tensor * input = node->GetInputTensor(0); + int layout=exec_attr->graph_layout; + + op_param.layout = layout; + + + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool PReluOps::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool PReluOps::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + const TShape& shape = input_tensor->GetShape(); + const std::vector dims = shape.GetDim(); + if(input_tensor->GetDataType() == TENGINE_DT_INT8 || + input_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + auto quant_param = input_tensor->GetQuantParam(); + if(quant_param->size() != 1 ) + return false; + op_param.scale = (*quant_param)[0].scale; + op_param.zero = (*quant_param)[0].zero_point; + } + int ret = -1; + int dim0 = dims[0]; + int dim1 = dims[1]; + int dim2 = dims[2]; + int dim3 = dims[3]; + void* data = get_tensor_mem(input_tensor); + void* out_data = get_tensor_mem(output_tensor); + const Tensor* slope_tensor = 
node->GetInputTensor(1); + float* slope = ( float* )get_tensor_mem(slope_tensor); + ret = kernel_run(data,out_data,dim0,dim1,dim2,dim3,slope,&op_param); + + if(ret<0) + return false; + else + return true; + + + +} + +bool PReluOps::Postrun(Node * node) +{ + return true; +} + +void PReluOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((prelu_t)prelu_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((prelu_t)prelu_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((prelu_t)prelu_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((prelu_t)prelu_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((prelu_t)prelu_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((prelu_t)prelu_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((prelu_t)prelu_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((prelu_t)prelu_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + PReluOps* ops = new PReluOps(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefReluOps +void RegisterPreluOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "PReLU", RefPreluOps::SelectFunc, 1000); +} +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/reduction.cpp b/executor/operator/ref/reduction.cpp new file mode 100644 index 000000000..824dc797a --- /dev/null +++ b/executor/operator/ref/reduction.cpp @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +#include "operator/reduction.hpp" +#include "kernel/reduction/reduce.h" + +namespace TEngine { + +namespace RefReductionOps { + + + +struct RefReduction : public MTNodeOps +{ + bool Prerun(Node * node) override; + + bool Run(Node * node) override; + void InitRegistry(void); + + reduce_t kernel_run; + reduce_param param; + + KernelRegistry kernel_registry; + RefReduction(void) + { + InitRegistry(); + } +}; + + +bool RefReduction::Prerun(Node * node) +{ + Tensor * input_tensor=node->GetInputTensor(0); + + int layout=exec_attr->graph_layout; + + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + + return true; +} +static int get_scale_zero(Tensor* itensor,Tensor * otensor,reduce_param* param) +{ + auto* i_quant = itensor->GetQuantParam(); + auto* o_quant = otensor->GetQuantParam(); + if( i_quant->size() != 1 ) + return -1; + param->scale[0] = (*i_quant)[0].scale; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + if( o_quant->size() != 1) + return -1; + + param->scale[1] = (*o_quant)[0].scale; + param->zero[1] = 
(*o_quant)[0].zero_point; + + param->zero[0] = (*i_quant)[0].zero_point; + } + return 0; +} + +bool RefReduction::Run(Node * node) +{ + Reduction* reduction_op = dynamic_cast(node->GetOp()); + ReductionParam* op_param = reduction_op->GetParam(); + + Tensor * input_tensor = node->GetInputTensor(0); + Tensor * out_tensor = node->GetOutputTensor(0); + int element_size = DataType::GetTypeSize(out_tensor->GetDataType()); + int out_size= out_tensor->GetTotalSize() / element_size; + + if(input_tensor->GetDataType() == TENGINE_DT_INT8 ||input_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero(input_tensor,out_tensor, ¶m) < 0) + return false; + } + const TShape& i_shape = input_tensor->GetShape(); + + std::vector dims=i_shape.GetDim(); + + int dim0=dims[0]; + int dim1=dims[1]; + int dim2=dims[2]; + int dim3=dims[3]; + + param.param_dim[0]=op_param->dim_0; + param.param_dim[1]=op_param->dim_1; + param.param_dim[2]=op_param->dim_2; + param.param_dim[3]=op_param->dim_3; + param.type=op_param->type; + + void* in_data=get_tensor_mem(input_tensor); + void* out_data=get_tensor_mem(out_tensor); + + int ret=kernel_run(in_data,out_data,dim0,dim1,dim2,dim3,out_size,¶m); + if(input_tensor->GetDataType() == TENGINE_DT_INT8) + { + auto* o_quant = out_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale =param.scale[1]; + o_quant->resize(0); + o_quant->push_back(q_param); + } + if(ret<0) + return false; + else + return true; +} + +void RefReduction::InitRegistry(void) +{ + +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((reduce_t)reduce_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((reduce_t)reduce_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((reduce_t)reduce_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((reduce_t)reduce_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + 
kernel_registry.Register((reduce_t)reduce_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((reduce_t)reduce_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((reduce_t)reduce_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((reduce_t)reduce_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefReduction* ops = new RefReduction(); + + LOG_DEBUG()<<"Reduction RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefSqueezeOps +void RegisterReductionOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Reduction", RefReductionOps::SelectFunc, 1000); +} +} // namespace TEngine diff --git a/executor/operator/ref/ref_add_n.cpp b/executor/operator/ref/ref_add_n.cpp new file mode 100644 index 000000000..be0ec8dc3 --- /dev/null +++ b/executor/operator/ref/ref_add_n.cpp @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: ruizhang@openailab.com + */ +#include +#include +#include +#include "kernel_registry.hpp" +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "graph.hpp" +#include "tengine_errno.hpp" +#include "operator/add_n.hpp" +#include "kernel/ref_add_n/ref_addn_kernel.h" +#include + +namespace TEngine { + +namespace RefAddNImpl { +//const int default_prio = 1500; +struct RefAddNOps : public NodeOps +{ + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + void InitRegistry(void); + RefAddNOps() + { + kernel_run = nullptr; + InitRegistry(); + } + struct ref_addn_param op_param; + ref_add_n_kernel_t kernel_run; + uint8_t** in_data_ptrs; + KernelRegistry kernel_registry; +}; + +void RefAddNOps::InitRegistry(void) +{ + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + +} + +bool RefAddNOps::Prerun(Node* node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + int data_type = input_tensor->GetDataType(); + int layout = exec_attr->graph_layout; + unsigned 
int input_num = node->GetInputNum(); + op_param.input_size = input_tensor->GetTotalSize(); + op_param.in_num = input_num; + op_param.in_scale=new float[input_num]; + op_param.in_zero =new int[input_num]; + in_data_ptrs = new uint8_t*[input_num]; + + if(!kernel_registry.GetKernel(kernel_run,layout,data_type)) + { + set_tengine_errno(ENOENT); + return false; + } + return true; +} + +bool RefAddNOps::Run(Node* node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + const int data_type = input_tensor->GetDataType(); + + for(int i = 0; i < op_param.in_num; ++i) + { + Tensor* input_tensor = node->GetInputTensor(i); + auto* in_quant = input_tensor->GetQuantParam(); + if(in_quant->size()) + { + op_param.in_scale[i] = (*in_quant)[0].scale; + op_param.in_zero[i] = (*in_quant)[0].zero_point; + } + + in_data_ptrs[i] = (uint8_t*)get_tensor_mem(input_tensor); + } + + Tensor* output_tensor = node->GetOutputTensor(0); + uint8_t* out_data = (uint8_t*)get_tensor_mem(output_tensor); + memset(out_data, 0, op_param.input_size); + if( data_type == TENGINE_DT_UINT8 ) + { + auto* o_quant = output_tensor->GetQuantParam(); + op_param.out_scale = (*o_quant)[0].scale; + op_param.out_zero = (*o_quant)[0].zero_point; + } + int ret = kernel_run(in_data_ptrs, out_data, &op_param); + if(ret<0) + return false; + + if( data_type == TENGINE_DT_INT8 ) + { + Tensor* o_tensor = node->GetOutputTensor(0); + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + return true; +} + +bool RefAddNOps::Postrun(Node* node) +{ + free(in_data_ptrs); + free(op_param.in_scale); + free(op_param.in_zero); + + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + RefAddNOps* ops = new RefAddNOps(); + return ops; +} + +} // namespace RefAddNImpl + +using namespace RefAddNImpl; + +void RegisterRefAddNOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, 
"Addn", RefAddNImpl::SelectFunc, 1000); +} + +} // namespace TEngine diff --git a/executor/operator/ref/ref_batchnorm.cpp b/executor/operator/ref/ref_batchnorm.cpp new file mode 100644 index 000000000..f26eb76f3 --- /dev/null +++ b/executor/operator/ref/ref_batchnorm.cpp @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: ruizhang@openailab.com + */ +#include +#include +#include +#include "kernel_registry.hpp" +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "tengine_errno.hpp" +#include "graph.hpp" +#include "operator/batch_norm.hpp" +#include "kernel/ref_batchnorm/ref_batchnorm_kernel.h" +#include + +namespace TEngine { + +namespace RefBatchNormImpl{ + +struct RefBatchNormOps : public NodeOps +{ + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + void InitRegistry(void); + RefBatchNormOps() + { + kernel_run = nullptr; + InitRegistry(); + } + struct ref_batchnorm_param op_param; + ref_batchnorm_kernel_t kernel_run; + KernelRegistry kernel_registry; +}; + +void RefBatchNormOps::InitRegistry(void) +{ + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + +} +bool RefBatchNormOps::Prerun(Node* node) +{ + BatchNorm* bn_op = dynamic_cast(node->GetOp()); + BatchNormParam* param = 
bn_op->GetParam(); + + const Tensor* input_tensor = node->GetInputTensor(0); + int data_type = input_tensor->GetDataType(); + const TShape& shape = input_tensor->GetShape(); + const std::vector dims = shape.GetDim(); + int channel_num = dims[1]; + float* scale_mean = ( float* )mem_alloc(channel_num * sizeof(float)); + float* scale_var_inv = ( float* )mem_alloc(channel_num * sizeof(float)); + const Tensor* mean_tensor = node->GetInputTensor(3); + const Tensor* var_tensor = node->GetInputTensor(4); + const float* mean = ( const float* )get_tensor_mem(mean_tensor); + const float* var = ( const float* )get_tensor_mem(var_tensor); + + float rescale_factor; + float eps = param->eps; + + rescale_factor = param->rescale_factor ? 1 / param->rescale_factor : 0; + for(int c = 0; c < channel_num; c++) + { + scale_var_inv[c] = 1.f / sqrt(var[c] * rescale_factor + eps); + scale_mean[c] = -mean[c] * rescale_factor * scale_var_inv[c]; + } + float* gamma = NULL; + float* beta = NULL; + if(!param->caffe_flavor) + { + const Tensor* gamma_tensor = node->GetInputTensor(1); + const Tensor* beta_tensor = node->GetInputTensor(2); + gamma = (float* )get_tensor_mem(gamma_tensor); + beta = (float* )get_tensor_mem(beta_tensor); + } + int layout = exec_attr->graph_layout; + op_param.iscaffe = param->caffe_flavor; + op_param.scale_mean = scale_mean; + op_param.scale_var_inv = scale_var_inv; + op_param.gamma = gamma; + op_param.beta = beta; + op_param.layout = layout; + + if(!kernel_registry.GetKernel(kernel_run, layout, data_type)) + { + set_tengine_errno(ENOENT); + return false; + } + return true; +} + +bool RefBatchNormOps::Run(Node* node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + const TShape& shape = input_tensor->GetShape(); + const std::vector dims = shape.GetDim(); + + if(TENGINE_LAYOUT_NCHW == op_param.layout) + { + if(4 == dims.size()) + { + op_param.input_n = dims[0]; + op_param.input_c = dims[1]; + op_param.input_h = dims[2]; + op_param.input_w = dims[3]; + } + else if(3 
== dims.size()) + { + op_param.input_n = dims[0]; + op_param.input_c = dims[1]; + op_param.input_w = dims[2]; + op_param.input_h = 1; + } + else + { + return false; + } + } + else + { + if(4 == dims.size()) + { + op_param.input_n = dims[0]; + op_param.input_c = dims[3]; + op_param.input_h = dims[1]; + op_param.input_w = dims[2]; + } + else if(3 == dims.size()) + { + op_param.input_n = dims[0]; + op_param.input_c = dims[2]; + op_param.input_w = dims[1]; + op_param.input_h = 1; + } + else + { + return false; + } + } + auto* in_quant = input_tensor->GetQuantParam(); + if(in_quant->size()) + { + op_param.in_scale = (*in_quant)[0].scale; + op_param.in_zero = (*in_quant)[0].zero_point; + } + uint8_t* input = (uint8_t*)get_tensor_mem(input_tensor); + Tensor* output_tensor = node->GetOutputTensor(0); + uint8_t*out_data = (uint8_t*)get_tensor_mem(output_tensor); + const int data_type = input_tensor->GetDataType(); + if( data_type == TENGINE_DT_UINT8 ) + { + auto* o_quant = output_tensor->GetQuantParam(); + op_param.out_scale = (*o_quant)[0].scale; + op_param.out_zero = (*o_quant)[0].zero_point; + } + int ret = kernel_run(input, out_data, &op_param); + if(ret<0) + return false; + + if(data_type == TENGINE_DT_INT8 ) + { + Tensor* o_tensor = node->GetOutputTensor(0); + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + + return true; +} + +bool RefBatchNormOps::Postrun(Node* node) +{ + free(op_param.scale_mean); + free(op_param.scale_var_inv); + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + RefBatchNormOps* ops = new RefBatchNormOps(); + return ops; +} + +}// namespace RefBatchNormImpl + + +void RegisterRefBatchNormOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "BatchNormalization", + RefBatchNormImpl::SelectFunc, 1000); +} + +} // namespace TEngine diff --git a/executor/operator/ref/ref_concat.cpp 
b/executor/operator/ref/ref_concat.cpp new file mode 100644 index 000000000..d7624e036 --- /dev/null +++ b/executor/operator/ref/ref_concat.cpp @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "graph.hpp" +#include "operator/concat.hpp" + +#include "kernel/concat/concat_kernel.h" + +namespace TEngine +{ + namespace RefConcatOps + { + const int default_prio = 1500; + struct RefConcat : public MTNodeOps + { + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + void InitRegistry(void); + + RefConcat() + { + kernel_run = nullptr; + InitRegistry(); + } + + struct concat_param op_param; + concat_t kernel_run; + void** input_data; + KernelRegistry kernel_registry; + }; + + void RefConcat::InitRegistry(void) + { + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((concat_t)ref_concat_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + 
kernel_registry.Register((concat_t)ref_concat_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((concat_t)ref_concat_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((concat_t)ref_concat_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((concat_t)ref_concat_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((concat_t)ref_concat_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((concat_t)ref_concat_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((concat_t)ref_concat_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + + } + + bool RefConcat::Prerun(Node* node) + { + int layout=exec_attr->graph_layout; + Tensor* output_tensor = node->GetOutputTensor(0); + Concat* concat_op = dynamic_cast(node->GetOp()); + ConcatParam* param = concat_op->GetParam(); + + Tensor* input_tensor = node->GetInputTensor(0); + int data_type = input_tensor->GetDataType(); + op_param.axis = param->axis; + + int in_nums = node->GetInputNum(); + input_data = new void*[in_nums]; + op_param.input_shape = new shape_dim[in_nums]; + op_param.input_counts = in_nums; + + auto dims = output_tensor->GetShape().GetDim(); + op_param.output_dim = (int)(dims.size()); + for(std::size_t ii=0; iiGetOutputTensor(0); + void* output = get_tensor_mem(o_tensor); + int data_type = -1; + for(int ii=0; iiGetInputTensor(ii); + data_type = input_tensor->GetDataType(); + auto* in_quant = input_tensor->GetQuantParam(); + if( (*in_quant).size() != 0 ) + { + op_param.input_shape[ii].scale = (*in_quant)[0].scale; + op_param.input_shape[ii].zero = (*in_quant)[0].zero_point; + } + else + { + op_param.input_shape[ii].scale = 1; + op_param.input_shape[ii].zero = 0; + } + + auto dims = input_tensor->GetShape().GetDim(); + op_param.input_dim = (int)(dims.size()); + for(std::size_t jj=0; 
jjGetQuantParam(); + if( (*o_quant).size() !=0) + { + op_param.output_shape.scale = (*o_quant)[0].scale; + op_param.output_shape.zero = (*o_quant)[0].zero_point; + } + else + { + op_param.output_shape.scale = 1; + op_param.output_shape.zero = 0; + } + + const void ** input = (const void**)input_data; + int ret = kernel_run(input, output, &op_param); + if(ret<0) + return false; + + if( data_type == TENGINE_DT_INT8 ) + { + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + + return true; + } + + bool RefConcat::Postrun(Node* node) + { + delete[] input_data; + delete[] op_param.input_shape; + return true; + } + + NodeOps* SelectFunc(const CPUInfo* info, Node* node) + { + RefConcat* ops = new RefConcat(); + + LOG_DEBUG()<<"Refconcat is selected\n"; + + return ops; + } + + + } //end namespace RefConcatOps + + void RegisterRefConcat(void) + { + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Concat", RefConcatOps::SelectFunc,RefConcatOps::default_prio); + } + +} diff --git a/executor/operator/ref/ref_convolution.cpp b/executor/operator/ref/ref_convolution.cpp new file mode 100644 index 000000000..e25052cfc --- /dev/null +++ b/executor/operator/ref/ref_convolution.cpp @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haoluo@openailab.com + */ +#include +#include +#include +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" + +#include "graph.hpp" +#include "operator/convolution.hpp" +#include "kernel/convolution/ref_conv_kernel.h" + +namespace TEngine { + +namespace RefConvolutionOps { + +const int default_prio = 1500; + +inline static int get_scale_zero(Tensor* itensor, Tensor* otensor, Tensor* ktensor, op_data* param) +{ + auto* i_quant = itensor->GetQuantParam(); + auto* k_quant = ktensor->GetQuantParam(); + auto* o_quant = otensor->GetQuantParam(); + if( i_quant->size() != 1 || k_quant->size() != 1) + { + std::cerr<<"quant size: input("<< i_quant->size()<<"),kernel("<size()<<")\n"; + return -1; + } + param->scale[0] = (*i_quant)[0].scale; + param->scale[1] = (*k_quant)[0].scale; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + if( o_quant->size() != 1) + { + std::cerr<<"output quant size: "<size()<<"\n"; + return -1; + } + + param->scale[2] = (*o_quant)[0].scale; + param->zero[2] = (*o_quant)[0].zero_point; + + param->zero[0] = (*i_quant)[0].zero_point; + param->zero[1] = (*k_quant)[0].zero_point; + } + //printf("scale: %f,%f,%f -- zero : %d,%d,%d \n", + // param->scale[0],param->scale[1],param->scale[2], + // param->zero[0],param->zero[1],param->zero[2]); + return 0; +} + +struct RefConv : public MTNodeOps +{ + bool Prerun(Node* node) override; + bool 
Reshape(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + void InitRegistry(void); + + bool dynamic_shape; + op_data op_param; + + ref_conv_kernel_t kernel_run; + KernelRegistry kernel_registry; + RefConv(void) + { + kernel_run=nullptr; + InitRegistry(); + } +}; +void RefConv::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_conv_kernel_t)ref_conv_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_conv_kernel_t)ref_conv_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_conv_kernel_t)ref_conv_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_conv_kernel_t)ref_conv_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif + +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_conv_kernel_t)ref_conv_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_conv_kernel_t)ref_conv_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_conv_kernel_t)ref_conv_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_conv_kernel_t)ref_conv_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +bool RefConv::Prerun(Node* node) +{ + int layout=exec_attr->graph_layout; + + Convolution* conv_op = dynamic_cast(node->GetOp()); + ConvParam* param = conv_op->GetParam(); + + Tensor* input_tensor = node->GetInputTensor(0); + op_param.batch = input_tensor->GetShape().GetN(); + op_param.in_shape[0] = input_tensor->GetShape().GetC(); + op_param.in_shape[1] = input_tensor->GetShape().GetH(); + op_param.in_shape[2] = input_tensor->GetShape().GetW(); + + Tensor* kernel_tensor = node->GetInputTensor(1); + op_param.kernels[0] = kernel_tensor->GetShape().GetH(); + op_param.kernels[1] = kernel_tensor->GetShape().GetW(); + + Tensor* output_tensor = node->GetOutputTensor(0); + op_param.out_shape[0] = 
output_tensor->GetShape().GetC(); + op_param.out_shape[1] = output_tensor->GetShape().GetH(); + op_param.out_shape[2] = output_tensor->GetShape().GetW(); + + op_param.strides[0] = param->stride_h; + op_param.strides[1] = param->stride_w; + + op_param.dilations[1] = param->dilation_h; + op_param.dilations[0] = param->dilation_w; + + op_param.pads[0] = param->pad_h0; + op_param.pads[1] = param->pad_w0; + op_param.group = param->group; + op_param.activation = param->activation; + op_param.layout = layout; + + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefConv::Reshape(Node* node) +{ + + Tensor* input_tensor = node->GetInputTensor(0); + op_param.batch = input_tensor->GetShape().GetN(); + op_param.in_shape[0] = input_tensor->GetShape().GetC(); + op_param.in_shape[1] = input_tensor->GetShape().GetH(); + op_param.in_shape[2] = input_tensor->GetShape().GetW(); + + Tensor* output_tensor = node->GetOutputTensor(0); + op_param.out_shape[0] = output_tensor->GetShape().GetC(); + op_param.out_shape[1] = output_tensor->GetShape().GetH(); + op_param.out_shape[2] = output_tensor->GetShape().GetW(); + + return true; +} +bool RefConv::Run(Node* node) +{ + //printf("---------------------------- Run ref_conv!!!\n"); + Tensor* i_tensor = node->GetInputTensor(0); + const void* input = get_tensor_mem(i_tensor); + Tensor* k_tensor = node->GetInputTensor(1); + const void* kernel = get_tensor_mem(k_tensor); + Tensor* b_tensor = node->GetInputTensor(2); + const void* bias = nullptr; + if(b_tensor != nullptr) + bias = get_tensor_mem(b_tensor); + Tensor* o_tensor = node->GetOutputTensor(0); + void* output = get_tensor_mem(o_tensor); + + /* Get input,kernel,output scale & zero */ + /* Current: one tensor has only one quantparam(scale)*/ + if(i_tensor->GetDataType() == TENGINE_DT_INT8 || + i_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero(i_tensor, o_tensor, 
k_tensor, &op_param) < 0) + return false; + } + + int ret = kernel_run(input,output,kernel,bias,&op_param); + if(i_tensor->GetDataType() == TENGINE_DT_INT8) + { + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.scale[2]; + o_quant->resize(0); + o_quant->push_back(q_param); + } + if(ret<0) + return false; + return true; +} + +bool RefConv::Postrun(Node* node) +{ + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + RefConv* ops = new RefConv(); + + return ops; +} + +} // namespace RefConvolutionOps + +void RegisterRefConv2d(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Convolution", RefConvolutionOps::SelectFunc, + RefConvolutionOps::default_prio); +} + +} // namespace TEngine diff --git a/executor/operator/ref/ref_deconvolution.cpp b/executor/operator/ref/ref_deconvolution.cpp new file mode 100644 index 000000000..cb0a8b2e2 --- /dev/null +++ b/executor/operator/ref/ref_deconvolution.cpp @@ -0,0 +1,257 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haoluo@openailab.com + */ +#include +#include +#include +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" + +#include "graph.hpp" +#include "operator/deconvolution.hpp" +#include "kernel/deconvolution/ref_deconv_kernel.h" + +namespace TEngine { + +namespace RefDeconvolutionOps { + +const int default_prio = 1500; + +struct RefDeconv : public MTNodeOps +{ + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Reshape(Node* node) override; + bool Postrun(Node* node) override; + bool GetSharedMemorySize(Node*, unsigned int& mem_size) override; + bool SetSharedMemoryAddr(Node*, void* mem_addr, int mem_size) override; + void InitRegistry(void); + + int element_size; + bool dynamic_shape; + deconv_ref_param op_param; + + ref_deconv_kernel_t kernel_run; + KernelRegistry kernel_registry; + RefDeconv(void) + { + kernel_run=nullptr; + InitRegistry(); + } +}; +void RefDeconv::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + 
kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} +bool RefDeconv::Reshape(Node* node) +{ + return true; +} + +bool RefDeconv::SetSharedMemoryAddr(Node* node, void* mem_addr, int mem_size) +{ + return true; +} + +bool RefDeconv::GetSharedMemorySize(Node* node, unsigned int& mem_size) +{ + return true; +} + +bool RefDeconv::Prerun(Node* node) +{ + int layout=exec_attr->graph_layout; + + Deconvolution* deconv_op = dynamic_cast(node->GetOp()); + DeconvParam* param = deconv_op->GetParam(); + + Tensor* input_tensor = node->GetInputTensor(0); + TShape inshape = input_tensor->GetShape(); + + if(0 == layout) // nchw + { + op_param.batch = inshape.Shape(0); + op_param.in_shape[0] = inshape.Shape(1); + op_param.in_shape[1] = inshape.Shape(2); + op_param.in_shape[2] = inshape.Shape(3); + } + else // nhwc + { + op_param.batch = inshape.Shape(0); + op_param.in_shape[0] = inshape.Shape(3); + op_param.in_shape[1] = inshape.Shape(1); + op_param.in_shape[2] = inshape.Shape(2); + } + + /* kernel quant param */ + Tensor* kernel_tensor = node->GetInputTensor(1); + auto* k_quant = kernel_tensor->GetQuantParam(); + if( (*k_quant).size() !=0) + { + op_param.scale[1] = (*k_quant)[0].scale; + op_param.zero[1] = (*k_quant)[0].zero_point; + } + + TShape wshape = kernel_tensor->GetShape(); + + if(0 == layout) // hw + { + op_param.kernels[0] = wshape.Shape(2); + op_param.kernels[1] = wshape.Shape(3); + } + else // + { + op_param.kernels[0] = wshape.Shape(1); + op_param.kernels[1] = wshape.Shape(2); + } + + /* output quant param */ + Tensor* output_tensor = node->GetOutputTensor(0); + auto* o_quant = output_tensor->GetQuantParam(); + if( (*o_quant).size() !=0) + { + op_param.scale[2] = (*o_quant)[0].scale; + op_param.zero[2] = (*o_quant)[0].zero_point; + } + + TShape outshape = output_tensor->GetShape(); + + if(0 == layout) // chw + { + op_param.out_shape[0] = outshape.Shape(1); + op_param.out_shape[1] = outshape.Shape(2); + 
op_param.out_shape[2] = outshape.Shape(3); + } + else + { + op_param.out_shape[0] = outshape.Shape(3); + op_param.out_shape[1] = outshape.Shape(1); + op_param.out_shape[2] = outshape.Shape(2); + } + + op_param.strides[0] = param->stride_h; + op_param.strides[1] = param->stride_w; + + op_param.dilations[1] = param->dilation_h; + op_param.dilations[0] = param->dilation_w; + + op_param.pads[0] = param->pad_h0; //pad_h + op_param.pads[1] = param->pad_w0; //pad_w + + op_param.group = param->group; + op_param.activation = param->activation; + op_param.layout = layout; + + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefDeconv::Run(Node* node) +{ + //printf("run ref_deconv!!!\n"); + Tensor* i_tensor = node->GetInputTensor(0); + const void* input = get_tensor_mem(i_tensor); + Tensor* k_tensor = node->GetInputTensor(1); + const void* kernel = get_tensor_mem(k_tensor); + Tensor* b_tensor = node->GetInputTensor(2); + const void* bias = nullptr; + if(b_tensor != nullptr) + bias = get_tensor_mem(b_tensor); + Tensor* o_tensor = node->GetOutputTensor(0); + void* output = get_tensor_mem(o_tensor); + + /* input quant param */ + auto* in_quant = i_tensor->GetQuantParam(); + if((*in_quant).size() !=0) + { + op_param.scale[0] = (*in_quant)[0].scale; + op_param.zero[0] = (*in_quant)[0].zero_point; + } + + int ret = kernel_run(input,output,kernel,bias,&op_param); + if(ret<0) + return false; + if(i_tensor->GetDataType() == TENGINE_DT_INT8) + { + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.scale[2]; + o_quant->resize(0); + o_quant->push_back(q_param); + } + return true; +} + +bool RefDeconv::Postrun(Node* node) +{ + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + RefDeconv* ops = new RefDeconv(); + + if(node->IsDynamicShape()) + ops->dynamic_shape = true; + else + ops->dynamic_shape = false; + + 
return ops; +} + +} // namespace RefDeconvolutionOps + +void RegisterRefDeconv2d(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Deconvolution", RefDeconvolutionOps::SelectFunc,RefDeconvolutionOps::default_prio); +} + +} // namespace TEngine diff --git a/executor/operator/ref/ref_detection_postprocess.cpp b/executor/operator/ref/ref_detection_postprocess.cpp new file mode 100644 index 000000000..4a82891f8 --- /dev/null +++ b/executor/operator/ref/ref_detection_postprocess.cpp @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#include +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/detection_postprocess.hpp" +#include "kernel/dpp/ref_dpp_kernel.h" + +namespace TEngine { + +namespace RefDetectionPostOps { + +struct RefDetectionPost : public NodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + dpp_param param; + ref_dpp_kernel_t kernel_run; + KernelRegistry kernel_registry; + + RefDetectionPost(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool RefDetectionPost::Prerun(Node * node) +{ + if(node->GetInputNum() != 3 || node->GetOutputNum()!=4) + return false; + + int layout = exec_attr->graph_layout; + DetectionPostProcess* dpp_op = dynamic_cast(node->GetOp()); + DetectionPostProcessParam* param_ = dpp_op->GetParam(); + param.max_classes_per_detection = param_->max_classes_per_detection; + param.nms_iou_threshold = param_->nms_iou_threshold; + param.nms_score_threshold = param_->nms_score_threshold; + param.num_classes = param_->num_classes; + param.max_detections = param_->max_detections; + param.scales[0] = param_->scales[0]; + param.scales[1] = param_->scales[1]; + param.scales[2] = param_->scales[2]; + param.scales[3] = param_->scales[3]; + + Tensor* input = node->GetInputTensor(0); + if( input->GetDataType() != TENGINE_DT_FP32 && + input->GetDataType() != TENGINE_DT_FP16 && + input->GetDataType() != TENGINE_DT_UINT8) + return false; + param.num_boxes = input->GetShape().Shape(1); + auto i_quant = input->GetQuantParam(); + + Tensor* score = node->GetInputTensor(1); + auto s_quant = score->GetQuantParam(); + + Tensor* anchor = node->GetInputTensor(2); + auto a_quant = anchor->GetQuantParam(); + + if(input->GetDataType() == TENGINE_DT_UINT8) + { + 
if(i_quant->size() == 0 || s_quant->size() == 0 || a_quant->size() == 0) + { + std::cerr<<"RefDetectionPost one quant is NONE: <"<size()<<"," + <size()<<","<size()<<"\n"; + return false; + } + param.quant_scale[0] = (*i_quant)[0].scale; + param.quant_scale[1] = (*s_quant)[0].scale; + param.quant_scale[2] = (*a_quant)[0].scale; + param.zero[0] = (*i_quant)[0].zero_point; + param.zero[1] = (*s_quant)[0].zero_point; + param.zero[2] = (*a_quant)[0].zero_point; + } + + if(!kernel_registry.GetKernel(kernel_run, layout, input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefDetectionPost::Run(Node * node) +{ + if(kernel_run == nullptr) + return false; + + //printf(" ********** run ref dpp\n"); + + Tensor* input = node->GetInputTensor(0); + const void* input_data = get_tensor_mem(input); + Tensor* score = node->GetInputTensor(1); + void* score_data = get_tensor_mem(score); + Tensor* anchor = node->GetInputTensor(2); + void* anchor_data = get_tensor_mem(anchor); + + Tensor* detect_boxes = node->GetOutputTensor(0); + float* detect_boxes_data = (float*)get_tensor_mem(detect_boxes); + Tensor* detect_classes = node->GetOutputTensor(1); + float* detect_classes_data = (float*)get_tensor_mem(detect_classes); + Tensor* detect_scores = node->GetOutputTensor(2); + float* detect_scores_data = (float*)get_tensor_mem(detect_scores); + Tensor* detect_num = node->GetOutputTensor(3); + float* detect_num_data = (float*)get_tensor_mem(detect_num); + + if(kernel_run(input_data, score_data, anchor_data, detect_num_data, + detect_classes_data, detect_scores_data, detect_boxes_data, ¶m)<0) + return false; + + return true; +} + +void RefDetectionPost::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_dpp_kernel_t)ref_dpp_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_dpp_kernel_t)ref_dpp_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + 
kernel_registry.Register((ref_dpp_kernel_t)ref_dpp_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_dpp_kernel_t)ref_dpp_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_dpp_kernel_t)ref_dpp_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_dpp_kernel_t)ref_dpp_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefDetectionPost* ops = new RefDetectionPost(); + + LOG_DEBUG()<<"Demo RefDetectionPost is selected\n"; + + return ops; +} + +} // namespace RefDetectionPostOps + +void RegisterRefDetectionPostOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "DetectionPostProcess", RefDetectionPostOps::SelectFunc, 1000); +} + + +} // namespace TEngine diff --git a/executor/operator/ref/ref_fully_connected.cpp b/executor/operator/ref/ref_fully_connected.cpp new file mode 100644 index 000000000..e189ef5f4 --- /dev/null +++ b/executor/operator/ref/ref_fully_connected.cpp @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#include +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/fully_connected.hpp" +#include "kernel/fully_connected/ref_fc_kernel.h" + +namespace TEngine { + +namespace RefFCOps { + +struct RefFC : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + struct fc_data param; + ref_fc_kernel_t kernel_run; + KernelRegistry kernel_registry; + + RefFC(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool RefFC::Prerun(Node * node) +{ + int layout = exec_attr->graph_layout; + FullyConnected* fc_op = dynamic_cast(node->GetOp()); + FCParam* param_ = fc_op->GetParam(); + param.out_number = param_->num_output; + + Tensor* input = node->GetInputTensor(0); + auto i_quant = input->GetQuantParam(); + + Tensor* weight = node->GetInputTensor(1); + int weight_out = weight->GetShape().Shape(0); + if(weight_out == param.out_number) + param.need_trans = 0; + else + param.need_trans = 1; + auto w_quant = weight->GetQuantParam(); + + Tensor* output = node->GetOutputTensor(0); + auto o_quant = output->GetQuantParam(); + + if(input->GetDataType() == TENGINE_DT_UINT8) + { + if(i_quant->size() == 0 || w_quant->size() == 0 || o_quant->size() == 0) + { + std::cerr<<"FC one quant is NONE: <"<size()<<"," + <size()<<","<size()<<"\n"; + return false; + } + param.scale[0] = (*i_quant)[0].scale; + param.scale[1] = (*w_quant)[0].scale; + param.scale[2] = (*o_quant)[0].scale; + param.zero[0] = (*i_quant)[0].zero_point; + param.zero[1] = (*w_quant)[0].zero_point; + param.zero[2] = (*o_quant)[0].zero_point; + } + else if(input->GetDataType() == TENGINE_DT_INT8) + { + if(w_quant->size() == 0) + { + std::cerr<<"FC weight quant size is NONE\n"; + return false; + } + 
param.scale[1] = (*w_quant)[0].scale; + param.zero[1] = (*w_quant)[0].zero_point; + } + + if(!kernel_registry.GetKernel(kernel_run, layout, input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefFC::Run(Node * node) +{ + if(kernel_run == nullptr) + return false; + + Tensor* input = node->GetInputTensor(0); + param.batch = input->GetShape().Shape(0); + param.hidden = input->GetShape().GetSize()/param.batch; + const void* input_data = get_tensor_mem(input); + Tensor* weight = node->GetInputTensor(1); + + void* weight_data = get_tensor_mem(weight); + + Tensor* output = node->GetOutputTensor(0); + void* output_data = get_tensor_mem(output); + + /* INT8 get input scale */ + if(input->GetDataType() == TENGINE_DT_INT8) + { + auto i_quant = input->GetQuantParam(); + param.scale[0] = (*i_quant)[0].scale; + param.zero[0] = (*i_quant)[0].zero_point; + } + + void* bias_data = nullptr; + if(node->GetInputNum() > 2) + { + Tensor* bias = node->GetInputTensor(2); + bias_data = get_tensor_mem(bias); + } + if(kernel_run(input_data, output_data, weight_data, bias_data, ¶m)<0) + return false; + + /* INT8 set output scale */ + if(input->GetDataType() == TENGINE_DT_INT8) + { + auto o_quant = output->GetQuantParam(); + o_quant->resize(1); + (*o_quant)[0].scale = param.scale[2]; + (*o_quant)[0].zero_point = param.zero[2]; + } + + return true; +} + +void RefFC::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_fc_kernel_t)ref_fc_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_fc_kernel_t)ref_fc_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_fc_kernel_t)ref_fc_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_fc_kernel_t)ref_fc_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif + +#ifdef CONFIG_KERNEL_INT8 + 
kernel_registry.Register((ref_fc_kernel_t)ref_fc_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_fc_kernel_t)ref_fc_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_fc_kernel_t)ref_fc_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_fc_kernel_t)ref_fc_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefFC* ops = new RefFC(); + + LOG_DEBUG()<<"Demo RefFCOp is selected\n"; + + return ops; +} + +} // namespace RefFCOps + +void RegisterRefFCOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "FullyConnected", RefFCOps::SelectFunc, 1000); +} + + +} // namespace TEngine diff --git a/executor/operator/ref/ref_lrn.cpp b/executor/operator/ref/ref_lrn.cpp new file mode 100644 index 000000000..262462b41 --- /dev/null +++ b/executor/operator/ref/ref_lrn.cpp @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ + +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "graph.hpp" +#include "operator/lrn.hpp" + +#include "kernel/lrn/ref_lrn_kernel.h" + +namespace TEngine +{ + namespace RefLrnOps + { + const int default_prio = 1500; + struct RefLrn : public MTNodeOps + { + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool RunNHWC(Node* node); + bool RunNCHW(Node* node); + bool Postrun(Node* node) override; + void InitRegistry(void); + + RefLrn() + { + kernel_run = nullptr; + InitRegistry(); + } + + ref_lrn_param op_param; + ref_lrn_kernel_t kernel_run; + KernelRegistry kernel_registry; + }; + + void RefLrn::InitRegistry(void) + { + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + } + + bool RefLrn::Prerun(Node* node) + { + int layout = exec_attr->graph_layout; + Tensor* input_tensor = node->GetInputTensor(0); + int data_type = input_tensor->GetDataType(); + + LRN* lrn_op = 
dynamic_cast(node->GetOp()); + LRNParam* param = lrn_op->GetParam(); + + op_param.layout = layout; + op_param.alpha = param->alpha; + op_param.beta = param->beta; + op_param.bias = param->k; + op_param.local_size = param->local_size; + op_param.norm_region = param->norm_region; + + auto dims = input_tensor->GetShape().GetDim(); + for(unsigned int i = 0; i < dims.size(); i++) + { + op_param.dims[i] = dims[i]; + } + + if(!kernel_registry.GetKernel(kernel_run, layout, data_type)) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; + } + + bool RefLrn::Run(Node* node) + { + if(exec_attr->graph_layout == TENGINE_LAYOUT_NCHW) + { + return RunNCHW(node); + } + else + { + // TODO: support NCHW + return false; + } + } + + bool RefLrn::RunNCHW(Node* node) + { + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + void* input = get_tensor_mem(input_tensor); + void* output = get_tensor_mem(output_tensor); + int data_type = input_tensor->GetDataType(); + + auto dims = input_tensor->GetShape().GetDim(); + for(unsigned int i = 0; i < dims.size(); i++) + { + op_param.dims[i] = dims[i]; + } + + auto* in_quant = input_tensor->GetQuantParam(); + if((*in_quant).size() != 0 ) + { + op_param.scale[0] = (*in_quant)[0].scale; + op_param.zero[0] = (*in_quant)[0].zero_point; + } + else + { + op_param.scale[0] = 1; + op_param.zero[0] = 0; + } + + auto* o_quant = output_tensor->GetQuantParam(); + if((*o_quant).size() !=0) + { + op_param.scale[1] = (*o_quant)[0].scale; + op_param.zero[1] = (*o_quant)[0].zero_point; + } + else + { + op_param.scale[1] = 1; + op_param.zero[1] = 0; + } + + if(kernel_run(input, output, &op_param) < 0) + return false; + + if(data_type == TENGINE_DT_INT8) + { + QuantParam q_param; + q_param.scale = op_param.scale[1]; + o_quant->resize(0); + o_quant->push_back(q_param); + } + + return true; + } + + bool RefLrn::Postrun(Node* node) + { + return true; + } + + NodeOps* SelectFunc(const CPUInfo* info, 
Node* node) + { + RefLrn* ops = new RefLrn(); + + LOG_DEBUG()<<"RefLrn is selected\n"; + + return ops; + } + + + } //end namespace RefLrnOps + + void RegisterRefLrn(void) + { + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "LRN", RefLrnOps::SelectFunc, RefLrnOps::default_prio); + } +} + diff --git a/executor/operator/ref/ref_normalize.cpp b/executor/operator/ref/ref_normalize.cpp new file mode 100644 index 000000000..a227d3a6f --- /dev/null +++ b/executor/operator/ref/ref_normalize.cpp @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: ruizhang@openailab.com + */ +#include +#include +#include +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" + +#include "graph.hpp" +#include "operator/normalize.hpp" +#include "kernel/ref_normalize/ref_normalize_kernel.h" + +namespace TEngine { + +namespace RefNormalizeOps { + + +struct RefNormalize : public MTNodeOps +{ + bool Prerun(Node* node) override; + bool Run(Node* node) override; + void InitRegistry(void); + ref_normalize_param op_param; + ref_normalize_kernel_t kernel_run; + KernelRegistry kernel_registry; + RefNormalize(void) + { + kernel_run=nullptr; + InitRegistry(); + } +}; + +void RefNormalize::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +bool RefNormalize::Prerun(Node* node) +{ + int layout = exec_attr->graph_layout; + Tensor* input_tensor = node->GetInputTensor(0); + 
if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefNormalize::Run(Node* node) +{ + + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + //Normalize* normalize_op = dynamic_cast(node->GetOp()); + //NormalizeParam* param_ = normalize_op->GetParam(); + + TShape& shape = input_tensor->GetShape(); + std::vector dims = shape.GetDim(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + + op_param.layout = exec_attr->graph_layout; + if(TENGINE_LAYOUT_NCHW == op_param.layout) + { + op_param.input_n = dims[0]; + op_param.input_h = dims[2]; + op_param.input_w = dims[3]; + op_param.input_c = dims[1]; + } + else // nhwc + { + op_param.input_n = dims[0]; + op_param.input_h = dims[1]; + op_param.input_w = dims[2]; + op_param.input_c = dims[3]; + } + + uint8_t *scale = NULL; + if(node->GetInputNum() > 1) + { + const Tensor* scale_tensor = node->GetInputTensor(1); + scale = (uint8_t* )get_tensor_mem(scale_tensor); + } + uint8_t* input = (uint8_t *)get_tensor_mem(input_tensor); + uint8_t* output = (uint8_t *)get_tensor_mem(output_tensor); + if(TENGINE_DT_UINT8 == input_tensor->GetDataType() || + TENGINE_DT_INT8 == input_tensor->GetDataType()) + { + auto *in_quant = input_tensor->GetQuantParam(); + if(in_quant->size()) + { + op_param.in_scale = (*in_quant)[0].scale; + op_param.in_zero = (*in_quant)[0].zero_point; + } + if(node->GetInputNum() == 2) + { + Tensor* scale_tensor = node->GetInputTensor(1); + auto *scale_quant = scale_tensor->GetQuantParam(); + if(scale_quant->size()) + { + op_param.scale_scale = (*scale_quant)[0].scale; + op_param.scale_zero = (*scale_quant)[0].zero_point; + } + } + } + if(TENGINE_DT_UINT8 == input_tensor->GetDataType()) + { + auto *out_quant = output_tensor->GetQuantParam(); + if(out_quant->size()) + { + op_param.out_scale = (*out_quant)[0].scale; + op_param.out_zero = 
(*out_quant)[0].zero_point; + } + } + int ret = kernel_run(input,output,scale,&(this->op_param)); + if(ret < 0) + return false; + + if(TENGINE_DT_INT8 == input_tensor->GetDataType()) + { + auto *out_quant = output_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + q_param.zero_point = 0; + out_quant->resize(0); + out_quant->push_back(q_param); + } + + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + RefNormalize* ops = new RefNormalize(); + + return ops; +} + +} // namespace RefNormalizeOps + +void RegisterRefNormlizeOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Normalize", RefNormalizeOps::SelectFunc, 2000); +} +} // namespace TEngine diff --git a/executor/operator/ref/ref_permute.cpp b/executor/operator/ref/ref_permute.cpp new file mode 100644 index 000000000..59838625e --- /dev/null +++ b/executor/operator/ref/ref_permute.cpp @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "graph.hpp" +#include "operator/permute.hpp" + +#include "kernel/permute/permute_kernel.h" + +namespace TEngine +{ + namespace RefPermuteOps + { + const int default_prio = 1500; + struct RefPermute : public MTNodeOps + { + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Reshape(Node* node) override; + bool Postrun(Node* node) override; + bool GetSharedMemorySize(Node*, unsigned int& mem_size) override; + bool SetSharedMemoryAddr(Node*, void* mem_addr, int mem_size) override; + void InitRegistry(void); + + RefPermute() + { + kernel_run = nullptr; + InitRegistry(); + } + + permute_param op_param; + permute_t kernel_run; + KernelRegistry kernel_registry; + }; + + void RefPermute::InitRegistry(void) + { + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((permute_t)ref_permute_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((permute_t)ref_permute_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((permute_t)ref_permute_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((permute_t)ref_permute_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((permute_t)ref_permute_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((permute_t)ref_permute_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((permute_t)ref_permute_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((permute_t)ref_permute_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + + } + + bool RefPermute::Reshape(Node* node) + { + return true; + } + + bool 
RefPermute::SetSharedMemoryAddr(Node* node, void* mem_addr, int mem_size) + { + return true; + } + + bool RefPermute::GetSharedMemorySize(Node* node, unsigned int& mem_size) + { + return true; + } + + bool RefPermute::Prerun(Node* node) + { + int layout=exec_attr->graph_layout; + + Permute* permute_op = dynamic_cast(node->GetOp()); + PermuteParam* param = permute_op->GetParam(); + + op_param.order0 = param->order0; + op_param.order1 = param->order1; + op_param.order2 = param->order2; + op_param.order3 = param->order3; + + Tensor* in_tensor = node->GetInputTensor(0); + auto dims = in_tensor->GetShape().GetDim(); + for(std::size_t ii=0; iiGetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; + } + + bool RefPermute::Run(Node* node) + { + Tensor* o_tensor = node->GetOutputTensor(0); + void* output = get_tensor_mem(o_tensor); + Tensor* i_tensor = node->GetInputTensor(0); + const void* input = get_tensor_mem(i_tensor); + float scale = 1; + int data_type = i_tensor->GetDataType(); + auto* i_quant = i_tensor->GetQuantParam(); + if( (*i_quant).size() !=0 ) + { + scale = (*i_quant)[0].scale; + } + + int ret = kernel_run(input,output,&op_param); + if(ret<0) + return false; + + if( data_type == TENGINE_DT_INT8 ) + { + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + + return true; + } + + bool RefPermute::Postrun(Node* node) + { + return true; + } + + NodeOps* SelectFunc(const CPUInfo* info, Node* node) + { + RefPermute* ops = new RefPermute(); + + LOG_DEBUG()<<"Refpermute is selected\n"; + + return ops; + } + + + } //end namespace RefConcatOps + + void RegisterRefPermute(void) + { + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Permute", RefPermuteOps::SelectFunc,RefPermuteOps::default_prio); + } + +} diff --git a/executor/operator/ref/ref_pooling.cpp b/executor/operator/ref/ref_pooling.cpp new file mode 100644 index 
000000000..5c055e71a --- /dev/null +++ b/executor/operator/ref/ref_pooling.cpp @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/pooling.hpp" +#include "kernel/pooling/ref_pooling_kernel.h" + +namespace TEngine { + +namespace RefPoolingOps { + +struct RefPooling : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool Reshape(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + struct op_data param; + ref_pooling_kernel_t kernel_run; + KernelRegistry kernel_registry; + + RefPooling(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool RefPooling::Prerun(Node * node) +{ + int layout = exec_attr->graph_layout; + param.layout = layout; + Pooling* pooling_op = dynamic_cast(node->GetOp()); + PoolParam* param_ = pooling_op->GetParam(); + param.kernels[0] = param_->kernel_h; + param.kernels[1] = param_->kernel_w; + param.strides[0] = param_->stride_h; + 
param.strides[1] = param_->stride_w; + param.pads[0] = param_->pad_h0; + param.pads[1] = param_->pad_w0; + param.method = param_->alg; + param.caffe_flavor = param_->caffe_flavor; + + Tensor * input = node->GetInputTensor(0); + param.batch = input->GetShape().GetN(); + param.channel = input->GetShape().GetC(); + param.input[0] = input->GetShape().GetH(); + param.input[1] = input->GetShape().GetW(); + + Tensor * output= node->GetOutputTensor(0); + param.output[0] = output->GetShape().GetH(); + param.output[1] = output->GetShape().GetW(); + + if(input->GetDataType() == TENGINE_DT_UINT8) + { + auto quant_param = input->GetQuantParam(); + param.zero_point = (*quant_param)[0].zero_point; + } + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefPooling::Reshape(Node* node) +{ + Pooling* pooling_op = dynamic_cast(node->GetOp()); + PoolParam* param_ = pooling_op->GetParam(); + param.kernels[0] = param_->kernel_h; + param.kernels[1] = param_->kernel_w; + + Tensor * input = node->GetInputTensor(0); + param.batch = input->GetShape().GetN(); + param.channel = input->GetShape().GetC(); + param.input[0] = input->GetShape().GetH(); + param.input[1] = input->GetShape().GetW(); + + Tensor * output= node->GetOutputTensor(0); + param.output[0] = output->GetShape().GetH(); + param.output[1] = output->GetShape().GetW(); + return true; +} + +bool RefPooling::Run(Node * node) +{ + if(kernel_run == nullptr) + return false; + + Tensor* input = node->GetInputTensor(0); + Tensor* output = node->GetOutputTensor(0); + auto i_quant = input->GetQuantParam(); + auto o_quant = output->GetQuantParam(); + if(input->GetDataType() == TENGINE_DT_INT8) + { + if(i_quant->size() != 1) + { + std::cerr<<"Input data_type is INT8 ,and quant param num is not 1 !!!!\n"; + return false; + } + o_quant->resize(0); + o_quant->push_back((*i_quant)[0]); + } + + + const void* input_data = get_tensor_mem(input); + 
void* output_data = get_tensor_mem(output); + + if(kernel_run(input_data, output_data, ¶m)<0) + return false; + + + return true; +} + +void RefPooling::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefPooling* ops = new RefPooling(); + + LOG_DEBUG()<<"Demo RefPoolingOp is selected\n"; + + return ops; +} + +} // namespace RefPoolingOps + +void RegisterRefPoolingOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Pooling", RefPoolingOps::SelectFunc, 8000); +} + + +} // namespace TEngine diff --git a/executor/operator/ref/ref_rpn.cpp b/executor/operator/ref/ref_rpn.cpp new file mode 100644 index 000000000..b1a6b3aec --- /dev/null +++ b/executor/operator/ref/ref_rpn.cpp @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#include +#include +#include +#include +#include + +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/rpn.hpp" +#include "kernel/rpn/ref_rpn_kernel.h" + +void ref_proposal_local_anchor(int feat_height, int feat_width, int feat_stride, std::vector& anchors, + float* local_anchors) +{ + int feat_size = feat_height*feat_width; + int num_anchors = ( int )anchors.size(); + for(int i = 0; i < num_anchors; ++i) + { + for(int j = 0; j < feat_height; j++) + for(int k = 0; k < feat_width; k++) + { + local_anchors[(i * 4 + 0) * feat_size + j * feat_width + k] = anchors[i].x0 + k*feat_stride; + local_anchors[(i * 4 + 1) * feat_size + j * feat_width + k] = anchors[i].y0 + j*feat_stride; + local_anchors[(i * 4 + 2) * feat_size + j * feat_width + k] = anchors[i].x1 + k*feat_stride; + local_anchors[(i * 4 + 3) * feat_size + j * feat_width + k] = anchors[i].y1 + j*feat_stride; + } + } +} + +namespace TEngine { + +namespace RefRPNImpl { + +struct RefRPNOps : public NodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + struct rpn_param param; + ref_rpn_kernel_t kernel_run; + KernelRegistry kernel_registry; + + RefRPNOps(void) + { + 
kernel_run=nullptr; + + InitRegistry(); + } + +}; +void RefRPNOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_rpn_kernel_t)ref_rpn_fp32, TENGINE_LAYOUT_NCHW, TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_rpn_kernel_t)ref_rpn_fp16, TENGINE_LAYOUT_NCHW, TENGINE_DT_FP16); +#endif +/* + +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif +*/ +} + +bool RefRPNOps::Prerun(Node* node) +{ + RPN* RPN_op = dynamic_cast(node->GetOp()); + RPNParam* param_ = RPN_op->GetParam(); + param.feat_stride = param_->feat_stride; + param.min_size = param_->min_size; + param.per_nms_topn = param_->per_nms_topn; + param.post_nms_topn = param_->post_nms_topn; + param.nms_thresh = param_->nms_thresh; + + int layout = exec_attr->graph_layout; + Tensor * input = node->GetInputTensor(0); + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefRPNOps::Run(Node* node) +{ + printf("ref RPN run\n"); + RPN* RPN_op = dynamic_cast(node->GetOp()); + RPNParam* param_ = RPN_op->GetParam(); + + const Tensor* score_tensor = node->GetInputTensor(0); + const Tensor* featmap_tensor = node->GetInputTensor(1); + const Tensor* info_tensor = node->GetInputTensor(2); + Tensor* output_tensor = node->GetOutputTensor(0); + TShape& out_shape = output_tensor->GetShape(); + + const void* score_org = get_tensor_mem(score_tensor); + void* featmap_org = get_tensor_mem(featmap_tensor); + const float* info_org = 
(float*)get_tensor_mem(info_tensor); + void* output_org = get_tensor_mem(output_tensor); + + const TShape& featmap_shape = featmap_tensor->GetShape(); + const int feat_channel = featmap_shape.GetC(); + const int feat_height = featmap_shape.GetH(); + const int feat_width = featmap_shape.GetW(); + const int feat_size = feat_height * feat_width; + + const TShape& score_shape = score_tensor->GetShape(); + param.num_anchors = ( int )param_->anchors_.size(); + param.feat_chan = feat_channel; + param.feat_height = feat_height; + param.feat_width = feat_width; + param.score_chan = score_shape.GetC(); + param.src_height = info_org[0]; + param.src_width = info_org[1]; + param.src_scale = info_org[2]; + + + // local_anchors (1, anchors_nums_ * 4, map_height_, map_width_); + int size = param.num_anchors * 4 * feat_size; + float* local_anchors = new float[size]; + + ref_proposal_local_anchor(feat_height, feat_width, param.feat_stride, param_->anchors_, local_anchors); + + int output_num = kernel_run(score_org, featmap_org, local_anchors, output_org, ¶m); + + std::vector outdim = {1, output_num, 4, 1}; + out_shape.SetDim(outdim); + + delete[] local_anchors; + + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + + RefRPNOps* ops = new RefRPNOps(); + + return ops; +} + +} // namespace RefRPNImpl + +void RegisterRefRPNOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor("reference", "RPN", RefRPNImpl::SelectFunc, 1000); + +} + +} // namespace TEngine diff --git a/executor/operator/ref/ref_slice.cpp b/executor/operator/ref/ref_slice.cpp new file mode 100644 index 000000000..579fed5be --- /dev/null +++ b/executor/operator/ref/ref_slice.cpp @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "graph.hpp" +#include "operator/slice.hpp" + +#include "kernel/slice/slice_kernel.h" + +namespace TEngine +{ + namespace RefSliceOps + { + const int default_prio = 1500; + struct RefSlice : public MTNodeOps + { + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + void InitRegistry(void); + + RefSlice() + { + kernel_run = nullptr; + InitRegistry(); + } + struct slice_param op_param; + slice_t kernel_run; + int8_t** out_data_ptrs; + KernelRegistry kernel_registry; + }; + + void RefSlice::InitRegistry(void) + { + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((slice_t)ref_slice_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((slice_t)ref_slice_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((slice_t)ref_slice_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((slice_t)ref_slice_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((slice_t)ref_slice_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + 
kernel_registry.Register((slice_t)ref_slice_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((slice_t)ref_slice_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((slice_t)ref_slice_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + + } + + bool RefSlice::Prerun(Node* node) + { + int layout=exec_attr->graph_layout; + Slice* slice_op = dynamic_cast(node->GetOp()); + SliceParam * param = slice_op->GetParam(); + Tensor* input_tensor = node->GetInputTensor(0); + int data_type = input_tensor->GetDataType(); + auto in_dim = input_tensor->GetShape().GetDim(); + unsigned int out_num = node->GetOutputNum(); + out_data_ptrs = new int8_t*[out_num]; + op_param.axis = param->axis; + op_param.output_shape = new shape_dim[out_num]; + op_param.out_num = out_num; + op_param.dim_num = (int)(in_dim.size()); + op_param.iscaffe = param->iscaffe; + if(!kernel_registry.GetKernel(kernel_run,layout,data_type)) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; + } + + bool RefSlice::Run(Node* node) + { + Slice* slice_op = dynamic_cast(node->GetOp()); + SliceParam * param = slice_op->GetParam(); + Tensor * input_tensor = node->GetInputTensor(0); + int8_t *input = (int8_t*)get_tensor_mem(input_tensor); + auto in_dim = input_tensor->GetShape().GetDim(); + auto *in_quant = input_tensor->GetQuantParam(); + if(in_quant->size() > 0) + { + op_param.out_scale = (*in_quant)[0].scale; + } + const int data_type = input_tensor->GetDataType(); + if(op_param.iscaffe) + { + //set the input dim and output dim + for(int i = 0; i < op_param.dim_num;i++) + { + op_param.in_shape[i] = in_dim[i]; + } + // set the output + for(int i = 0; i < op_param.out_num; ++i) + { + Tensor * out_tensor = node->GetOutputTensor(i); + auto out_dim = out_tensor->GetShape().GetDim(); + for(int j = 0; j < op_param.dim_num; ++j) + { + op_param.output_shape[i].dims[j] = out_dim[j]; + } + out_data_ptrs[i] = 
(int8_t*)get_tensor_mem(out_tensor); + //set the output quant param + if( data_type == TENGINE_DT_INT8 ) + { + auto* o_quant = out_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + } + } + else // For tensorflow, there is only one output tensor + { + int maxdim = 4; + int real_dim = op_param.dim_num; + int dim_idx = 0; + for(int idx = 0; idx < maxdim; idx++) + { + if(maxdim - idx > real_dim) + { + op_param.output_shape[0].begins[idx] = 0; + op_param.output_shape[0].sizes[idx] = 1; + op_param.in_shape[idx] = 1; + } + else + { + op_param.output_shape[0].begins[idx] = param->begin_[dim_idx]; + op_param.output_shape[0].sizes[idx] = param->size_[dim_idx]; + op_param.in_shape[idx] = in_dim[dim_idx]; + dim_idx++; + } + } + Tensor* o_tensor = node->GetOutputTensor(0); + out_data_ptrs[0] = (int8_t*)get_tensor_mem(o_tensor); + // Set the int8 output quant param + if( data_type == TENGINE_DT_INT8 ) + { + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + + } + int ret = kernel_run(input,out_data_ptrs,&op_param); + if(ret<0) + return false; + return true; + } + + bool RefSlice::Postrun(Node* node) + { + delete[] out_data_ptrs; + delete[] op_param.output_shape; + return true; + } + + NodeOps* SelectFunc(const CPUInfo* info, Node* node) + { + RefSlice* ops = new RefSlice(); + + LOG_DEBUG()<<"RefSlice is selected\n"; + + return ops; + } + + + } //end namespace RefSliceOps + + void RegisterRefSlice(void) + { + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Slice", + RefSliceOps::SelectFunc,RefSliceOps::default_prio); + } + +} diff --git a/executor/operator/ref/ref_softmax.cpp b/executor/operator/ref/ref_softmax.cpp new file mode 100644 index 000000000..3bd6cca28 --- /dev/null +++ b/executor/operator/ref/ref_softmax.cpp @@ -0,0 +1,211 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#include +#include +#include "kernel/softmax/ref_softmax.h" + +#include "data_type.hpp" +#include "operator/softmax.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +namespace TEngine { + +namespace RefSoftmaxOps { + +/* impl ref softmax op */ +// +inline static int get_scale_zero(Tensor* itensor, Tensor* otensor, op_data* param) +{ + auto* i_quant = itensor->GetQuantParam(); + auto* o_quant = otensor->GetQuantParam(); + if( i_quant->size() != 1) + { + std::cerr<<"quant size: input("<< i_quant->size()<<")\n"; + return -1; + } + param->i_scale = (*i_quant)[0].scale; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + if( o_quant->size() != 1) + { + std::cerr<<"output quant size: "<size()<<"\n"; + return -1; + } + + param->o_scale = (*o_quant)[0].scale; + param->o_zero = (*o_quant)[0].zero_point; + + param->i_zero = (*i_quant)[0].zero_point; + } + return 0; +} +// +struct RefSoftmax : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + void 
InitRegistry(void); + + float * max_array; + float * sum_array; + + op_data op_param; + + ref_softmax_kernel_t kernel_run; + + KernelRegistry kernel_registry; + + RefSoftmax(void) + { + max_array=nullptr; + sum_array=nullptr; + + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool RefSoftmax::Prerun(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + int layout = exec_attr->graph_layout; + + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + return true; +} + +bool RefSoftmax::Run(Node * node) +{ + Tensor * input_tensor=node->GetInputTensor(0); + Tensor * output_tensor=node->GetOutputTensor(0); + + const std::vector& dims = input_tensor->GetShape().GetDim(); + // + Softmax* softmax_op = dynamic_cast(node->GetOp()); + SoftmaxParam* param_ = softmax_op->GetParam(); + int axis = param_->axis; + int out_size = 1; + for(int i = 0; i < axis; i++) + { + out_size *= dims[i]; + } + int in_size = 1; + for(size_t i = axis + 1; i < dims.size(); i++) + { + in_size *= dims[i]; + } + int on_size = dims[axis]; + + max_array = ( float* )std::malloc(in_size * sizeof(float)); + sum_array = ( float* )std::malloc(in_size * sizeof(float)); + + // + op_param.out_size=out_size; + op_param.in_size=in_size; + op_param.on_size=on_size; + + // + void* input=(void*)get_tensor_mem(input_tensor); + void* output=(void*)get_tensor_mem(output_tensor); + // + /* Get input,kernel,output scale & zero */ + /* Current: one tensor has only one quantparam(scale)*/ + if(input_tensor->GetDataType() == TENGINE_DT_INT8 || + input_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero(input_tensor, output_tensor, &op_param) < 0) + return false; + } + // + int ret = kernel_run(input,output,max_array,sum_array,&op_param); + // + if(input_tensor->GetDataType() == TENGINE_DT_INT8) + { + auto* o_quant = output_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.o_scale; + 
o_quant->resize(0); + o_quant->push_back(q_param); + } + + std::free(max_array); + std::free(sum_array); + + if(ret<0) + return false; + else + return true; +} + +void RefSoftmax::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefSoftmax* ops = new RefSoftmax(); + + LOG_DEBUG()<<"RefSoftmaxOp is selected\n"; + + return ops; +} + +} // namespace RefSoftmaxOps + +void RegisterRefSoftmaxOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Softmax", RefSoftmaxOps::SelectFunc, 1000); +} + +} // namespace TEngine diff --git a/executor/operator/ref/ref_swap_axis.cpp b/executor/operator/ref/ref_swap_axis.cpp new file mode 100644 index 000000000..7afba1f6b --- /dev/null +++ b/executor/operator/ref/ref_swap_axis.cpp @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#include + + + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/swap_axis.hpp" + +#include "kernel/swap_axis/ref_swap_axis_kernel.h" + +namespace TEngine { + +namespace RefSwapAxisOps { + + + +struct RefSwapAxis : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + int dims[5]; + ref_swap_axis_kernel_t kernel_run; + KernelRegistry kernel_registry; + RefSwapAxis(void) + { + + kernel_run=nullptr; + + InitRegistry(); + } +}; + +void RefSwapAxis::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif +/* +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef 
CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axisl_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif +*/ +} + +bool RefSwapAxis::Prerun(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + int layout = exec_attr->graph_layout; + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + std::vector& in_dims = input_tensor->GetShape().GetDim(); + int in_dims_size = in_dims.size(); + SwapAxis* swap = dynamic_cast(node->GetOp()); + SwapAxisParam* param_ = swap->GetParam(); + int dim0 = param_->dim_0; + int dim1 = param_->dim_1; + if(dim0 > dim1) + { + int tmp = dim0; + dim0 = dim1; + dim1 = tmp; + } + + for(int i = 0; i < 5; i++) + dims[i] = 1; + //dim0 + for(int i = 0; i < dim0; i++) + dims[0] *= in_dims[i]; + //dim1 + dims[1] = in_dims[dim0]; + //dim2 + for(int i = dim0+1; i < dim1; i++ ) + dims[2] *= in_dims[i]; + //dim3 + dims[3] = in_dims[dim1]; + //dim4 + for(int i = dim1+1; i < in_dims_size; i++ ) + dims[4] *= in_dims[i]; + + + return true; +} + +bool RefSwapAxis::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + + void* input_org = get_tensor_mem(input_tensor); + void* output_org = get_tensor_mem(output_tensor); + + kernel_run(input_org, output_org, dims); + + return true; +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefSwapAxis* ops = new RefSwapAxis(); + + LOG_DEBUG()<<"RefSwapAxis is selected\n"; + + return ops; +} + + +} // namespace RefSwapAxisOps + +void RegisterSwapAxisOps(void) 
+{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "SwapAxis", RefSwapAxisOps::SelectFunc, 1000); +} + +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/relu.cpp b/executor/operator/ref/relu.cpp new file mode 100644 index 000000000..b94661946 --- /dev/null +++ b/executor/operator/ref/relu.cpp @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/relu.hpp" + +#include "kernel/relu/relu.h" + +namespace TEngine { + +namespace RefReluOps { + + + +struct ReluOps : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + relu_t kernel_run; + + KernelRegistry kernel_registry; + + ReluOps(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool ReluOps::Prerun(Node * node) +{ + Tensor * input=node->GetInputTensor(0); + int layout=exec_attr->graph_layout; + + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + //printf("errorno: %d\n",ENOENT); + return false; + } + + + return true; +} + +bool ReluOps::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool ReluOps::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + const TShape& shape = output_tensor->GetShape(); + int elem_num = shape.GetSize(); + + ReLu* relu_op = dynamic_cast(node->GetOp()); + ReLuParam* param = relu_op->GetParam(); + void* data = get_tensor_mem(output_tensor); + float negativeslope=param->negative_slope; + + float scale = 1.f; + int zero_point = 0; + if(input_tensor->GetDataType() == TENGINE_DT_INT8 || input_tensor->GetDataType() == TENGINE_DT_UINT8) + { + auto quant_param = input_tensor->GetQuantParam(); + scale = (*quant_param)[0].scale; + zero_point = (*quant_param)[0].zero_point; + auto out_quant_param = output_tensor->GetQuantParam(); + out_quant_param->resize(0); + out_quant_param->push_back((*quant_param)[0]); + } + + int 
ret=kernel_run(data,elem_num,negativeslope, scale, zero_point); + + if(ret<0) + return false; + else + return true; +} + +void ReluOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((relu_t)relu_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((relu_t)relu_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((relu_t)relu_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((relu_t)relu_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((relu_t)relu_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((relu_t)relu_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((relu_t)relu_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((relu_t)relu_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + ReluOps* ops = new ReluOps(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefReluOps +void RegisterReluOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "ReLu", RefReluOps::SelectFunc, 1000); +} +} // namespace TEngine diff --git a/executor/operator/ref/relu6.cpp b/executor/operator/ref/relu6.cpp new file mode 100644 index 000000000..5306f3070 --- /dev/null +++ b/executor/operator/ref/relu6.cpp @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "kernel/relu6/relu6.h" + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +namespace TEngine { + +namespace RefRelu6Ops { + + + +struct Relu6Ops : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + relu6_t kernel_run; + + KernelRegistry kernel_registry; + + Relu6Ops(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool Relu6Ops::Prerun(Node * node) +{ + Tensor * input=node->GetInputTensor(0); + int layout=exec_attr->graph_layout; + + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + //int elem_size=DataType::GetTypeSize(input->GetDataType()); + + return true; +} + +bool Relu6Ops::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool Relu6Ops::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + //int element_size = DataType::GetTypeSize(input_tensor->GetDataType()); + const TShape& shape = input_tensor->GetShape(); + int elem_num = shape.GetSize(); + void* data = get_tensor_mem(output_tensor); + + float scale = 1.f; + int zero_point = 0; + 
if(input_tensor->GetDataType() == TENGINE_DT_INT8 || input_tensor->GetDataType() == TENGINE_DT_UINT8) + { + auto quant_param = input_tensor->GetQuantParam(); + scale = (*quant_param)[0].scale; + zero_point = (*quant_param)[0].zero_point; + auto out_quant_param = output_tensor->GetQuantParam(); + out_quant_param->resize(0); + out_quant_param->push_back((*quant_param)[0]); + } + + int ret=kernel_run(data,elem_num, scale, zero_point); + + if(ret<0) + return false; + else + return true; +} + +void Relu6Ops::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((relu6_t)relu6_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((relu6_t)relu6_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((relu6_t)relu6_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((relu6_t)relu6_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((relu6_t)relu6_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((relu6_t)relu6_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((relu6_t)relu6_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((relu6_t)relu6_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + Relu6Ops* ops = new Relu6Ops(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefRelu6Ops +void RegisterRelu6Ops(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "ReLu6", RefRelu6Ops::SelectFunc, 1000); +} + +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/reshape.cpp b/executor/operator/ref/reshape.cpp new file mode 100644 index 000000000..39a2f7669 --- /dev/null +++ b/executor/operator/ref/reshape.cpp @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/reshape.hpp" + +namespace TEngine { + +namespace RefReshapeOps { + + + +struct RefReshape : public MTNodeOps +{ + bool OnBind(Node * node) override; + bool Run(Node * node) override; + +}; + + +bool RefReshape::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool RefReshape::Run(Node * node) +{ + + return true; +} + + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefReshape* ops = new RefReshape(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefReluOps +void RegisterReshapeOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Reshape", RefReshapeOps::SelectFunc, 1000); +} +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/resize.cpp b/executor/operator/ref/resize.cpp new file mode 100644 index 000000000..adf601da6 
--- /dev/null +++ b/executor/operator/ref/resize.cpp @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +#include "operator/resize.hpp" +#include "kernel/resize/resize_kernel.h" + +namespace TEngine { + +namespace RefResizeOps { + + +struct ResizeOps : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + bool Postrun(Node * node) override; + void InitRegistry(void); + + struct resize_param op_param; + resize_t kernel_run; + + + KernelRegistry kernel_registry; + + ResizeOps(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool ResizeOps::Prerun(Node * node) +{ + Tensor * input=node->GetInputTensor(0); + int layout=exec_attr->graph_layout; + Resize* resize_op = dynamic_cast(node->GetOp()); + ResizeParam* param_ = resize_op->GetParam(); + op_param.scale_x = 1.f / param_->scale_w; + op_param.scale_x = 1.f / param_->scale_h; + op_param.type = param_->type; + + 
if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool ResizeOps::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + const TShape& shape = input_tensor->GetShape(); + + op_param.batch = shape.GetN(); + op_param.channel = shape.GetC(); + op_param.input_h = shape.GetH(); + op_param.input_w = shape.GetW(); + + const TShape& shape1 = output_tensor->GetShape(); + op_param.output_h = shape1.GetH(); + op_param.output_w = shape1.GetW(); + + + float* input = ( float* )get_tensor_mem(input_tensor); + float* output = ( float* )get_tensor_mem(output_tensor); + int ret=-1; + + ret=kernel_run(input, output, &op_param); + + if(ret<0) + return false; + else + return true; + +} + +bool ResizeOps::Postrun(Node * node) +{ + return true; +} + +void ResizeOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((resize_t)resize_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((resize_t)resize_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); +#endif +// #ifdef CONFIG_KERNEL_INT8 +// kernel_registry.Register((resize_t)resize_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); +// kernel_registry.Register((resize_t)resize_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +// #endif + +// #ifdef CONFIG_KERNEL_UINT8 +// kernel_registry.Register((resize_t)resize_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); +// kernel_registry.Register((resize_t)resize_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +// #endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + ResizeOps* ops = new ResizeOps(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefReluOps +void RegisterResizeOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Resize", RefResizeOps::SelectFunc, 1000); +} +} // namespace 
TEngine diff --git a/executor/operator/ref/sigmoid.cpp b/executor/operator/ref/sigmoid.cpp new file mode 100644 index 000000000..dc617610a --- /dev/null +++ b/executor/operator/ref/sigmoid.cpp @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "kernel/sigmoid/sigmoid.h" + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +namespace TEngine { + +namespace RefSigmoidOps { + + + +struct SigmoidOps : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + bool Postrun(Node * node) override; + void InitRegistry(void); + + sigmoid_param op_param; + sigmoid_t kernel_run; + + KernelRegistry kernel_registry; + + SigmoidOps(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +static int get_scale_zero(Tensor* itensor,Tensor * otensor,sigmoid_param* param) +{ + auto* i_quant = itensor->GetQuantParam(); + auto* o_quant = otensor->GetQuantParam(); + if( i_quant->size() != 1 ) + { + return -1; + } + param->scale[0] = (*i_quant)[0].scale; + param->zero[0] = (*i_quant)[0].zero_point; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + if( o_quant->size() != 1) + { + return -1; + } + + param->scale[1] = (*o_quant)[0].scale; + param->zero[1] = (*o_quant)[0].zero_point; + + } + return 0; +} + +bool SigmoidOps::Prerun(Node * node) +{ + Tensor * input=node->GetInputTensor(0); + int layout=exec_attr->graph_layout; + + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool SigmoidOps::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool SigmoidOps::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + const TShape& shape = input_tensor->GetShape(); + int elem_num = shape.GetSize(); + void* data = get_tensor_mem(input_tensor); + if(input_tensor->GetDataType() == 
TENGINE_DT_INT8 || + input_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero(input_tensor, output_tensor, &op_param) < 0) + return false; + } + + int ret = kernel_run(data, elem_num, &op_param); + + if(input_tensor->GetDataType() == TENGINE_DT_INT8) + { + auto* o_quant = output_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = 1/127; + q_param.zero_point = 0; + o_quant->resize(0); + o_quant->push_back(q_param); + } + + + if(ret<0) + return false; + else + return true; +} + +bool SigmoidOps::Postrun(Node * node) +{ + return true; +} + +void SigmoidOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((sigmoid_t)sigmoid_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((sigmoid_t)sigmoid_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((sigmoid_t)sigmoid_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((sigmoid_t)sigmoid_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((sigmoid_t)sigmoid_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((sigmoid_t)sigmoid_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((sigmoid_t)sigmoid_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((sigmoid_t)sigmoid_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + SigmoidOps* ops = new SigmoidOps(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefsigmoidOps +void RegisterSigmoidOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Sigmoid", RefSigmoidOps::SelectFunc, 1000); +} + +} // namespace TEngine diff --git a/executor/operator/ref/split.cpp b/executor/operator/ref/split.cpp new file mode 100644 index 000000000..de9582b58 --- /dev/null 
+++ b/executor/operator/ref/split.cpp @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "graph.hpp" +#include "operator/split.hpp" + +#include "kernel/split/split_kernel.h" + +namespace TEngine +{ + namespace RefSplitOps + { + const int default_prio = 1500; + struct RefSplit : public MTNodeOps + { + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + void InitRegistry(void); + + RefSplit() + { + kernel_run = nullptr; + InitRegistry(); + } + + struct split_param op_param; + split_t kernel_run; + void** output_data; + KernelRegistry kernel_registry; + }; + + void RefSplit::InitRegistry(void) + { + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((split_t)split_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((split_t)split_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + 
kernel_registry.Register((split_t)split_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((split_t)split_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((split_t)split_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((split_t)split_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((split_t)split_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((split_t)split_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + + } + + bool RefSplit::Prerun(Node* node) + { + int layout=exec_attr->graph_layout; + Tensor* output_tensor = node->GetOutputTensor(0); + Split* split_op = dynamic_cast(node->GetOp()); + SplitParam* param = split_op->GetParam(); + + Tensor* input_tensor = node->GetInputTensor(0); + int data_type = input_tensor->GetDataType(); + op_param.axis = param->axis; + + int out_nums = node->GetOutputNum(); + output_data = new void*[out_nums]; + + op_param.output_shape = new shape_dim[out_nums]; + op_param.output_counts = out_nums; + + + + auto dims = output_tensor->GetShape().GetDim(); + op_param.output_dim = (int)(dims.size()); + for(int i=0;iGetInputTensor(0); + void* input = get_tensor_mem(i_tensor); + int data_type = -1; + + data_type = i_tensor->GetDataType(); + auto* in_quant = i_tensor->GetQuantParam(); + if( (*in_quant).size() != 0 ) + { + op_param.input_shape.scale = (*in_quant)[0].scale; + op_param.out_scale = (*in_quant)[0].scale; + op_param.input_shape.zero = (*in_quant)[0].zero_point; + } + else + { + op_param.input_shape.scale = 1; + op_param.input_shape.zero = 0; + } + + auto dims = i_tensor->GetShape().GetDim(); + op_param.input_dim = (int)(dims.size()); + + for(std::size_t jj=0; jjGetOutputTensor(ii); + auto* o_quant = o_tensor->GetQuantParam(); + if( (*o_quant).size() !=0) + { + op_param.output_shape[ii].scale = (*o_quant)[0].scale; + op_param.output_shape[ii].zero = 
(*o_quant)[0].zero_point; + } + else + { + op_param.output_shape[ii].scale = 1; + op_param.output_shape[ii].zero = 0; + } + output_data[ii] = get_tensor_mem(o_tensor); + + } + + int ret = kernel_run(input, output_data, &op_param); + if(ret<0) + return false; + + if( data_type == TENGINE_DT_INT8 ) + { + + for(int ii=0; iiGetOutputTensor(ii); + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + } + + return true; + + } + + bool RefSplit::Postrun(Node* node) + { + delete[] output_data; + delete[] op_param.output_shape; + return true; + } + + NodeOps* SelectFunc(const CPUInfo* info, Node* node) + { + RefSplit* ops = new RefSplit(); + + LOG_DEBUG()<<"Refconcat is selected\n"; + + return ops; + } + + + } //end namespace RefSplitOps + + void RegisterSplitOps(void) + { + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Split", RefSplitOps::SelectFunc,RefSplitOps::default_prio); + } + +} diff --git a/executor/operator/ref/squeeze.cpp b/executor/operator/ref/squeeze.cpp new file mode 100644 index 000000000..db0a2ef2f --- /dev/null +++ b/executor/operator/ref/squeeze.cpp @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + + + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/squeeze.hpp" + +namespace TEngine { + +namespace RefSqueezeOps { + + + +struct RefSqueeze : public MTNodeOps +{ + bool OnBind(Node * node) override; + bool Run(Node * node) override; + + +}; + + +bool RefSqueeze::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool RefSqueeze::Run(Node * node) +{ + + return true; +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefSqueeze* ops = new RefSqueeze(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefSqueezeOps +void RegisterSqueezeOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Squeeze", RefSqueezeOps::SelectFunc, 1000); +} +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/tanh.cpp b/executor/operator/ref/tanh.cpp new file mode 100644 index 000000000..858ab20b5 --- /dev/null +++ b/executor/operator/ref/tanh.cpp @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "kernel/tanh/tanh.h" + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +namespace TEngine { + +namespace RefTanhOps { + + + +struct TanhOps : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + struct tanh_param op_param; + tanh_t kernel_run; + + KernelRegistry kernel_registry; + + TanhOps(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool TanhOps::Prerun(Node * node) +{ + Tensor * input=node->GetInputTensor(0); + Tensor * output=node->GetOutputTensor(0); + int layout=exec_attr->graph_layout; + + if(output->GetDataType() == TENGINE_DT_UINT8 ) + { + auto output_quant = output->GetQuantParam(); + if(output_quant->size() < 1) + return false; + op_param.output_scale = (*output_quant)[0].scale; + op_param.output_zero = (*output_quant)[0].zero_point; + } + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + + return true; +} + +bool TanhOps::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool TanhOps::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + const TShape& shape = input_tensor->GetShape(); + int elem_num = 
shape.GetSize(); + void* data = get_tensor_mem(input_tensor); + + if(input_tensor->GetDataType() == TENGINE_DT_INT8 || + input_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + auto input_quant = input_tensor->GetQuantParam(); + if(input_quant->size() < 1) + return false; + op_param.input_scale = (*input_quant)[0].scale; + op_param.input_zero = (*input_quant)[0].zero_point; + } + + int ret=kernel_run(data, elem_num, &op_param); + + if(ret<0) + return false; + else + return true; +} + +void TanhOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((tanh_t)tanh_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((tanh_t)tanh_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((tanh_t)tanh_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((tanh_t)tanh_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((tanh_t)tanh_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((tanh_t)tanh_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((tanh_t)tanh_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((tanh_t)tanh_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + TanhOps* ops = new TanhOps(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace ReftanhOps +void RegisterTanhOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Tanh", RefTanhOps::SelectFunc, 1000); +} + +} // namespace TEngine diff --git a/executor/plugin/init.cpp b/executor/plugin/init.cpp index 28d84d651..f21986061 100644 --- a/executor/plugin/init.cpp +++ b/executor/plugin/init.cpp @@ -34,11 +34,6 @@ using namespace TEngine; namespace TEngine { extern void NodeOpsRegistryManagerInit(void); -extern void 
RegisterCommonOps(void); - -#if CONFIG_ARCH_ARM64 == 1 || CONFIG_ARCH_ARM32 == 1 -extern void RegisterArmOps(void); -#endif void DevAllocatorManagerInit(void); void DevSchedulerManagerInit(void); @@ -48,12 +43,6 @@ int executor_plugin_init(void) { NodeOpsRegistryManagerInit(); - RegisterCommonOps(); - -#if CONFIG_ARCH_ARM64 || CONFIG_ARCH_ARM32 - RegisterArmOps(); -#endif - DevAllocatorManagerInit(); DevSchedulerManagerInit(); diff --git a/hclarm/Makefile b/hclarm/Makefile new file mode 100644 index 000000000..ded2ce2e2 --- /dev/null +++ b/hclarm/Makefile @@ -0,0 +1,99 @@ +BUILD_DIR?=$(shell pwd)/build +INSTALL_DIR?=$(shell pwd)/install +MAKEBUILD?=$(shell pwd)/makefile.build + +export CC CXX CFLAGS LD LDFLAGS CXXFLAGS COMMON_CFLAGS + +default: $(LIB_HCL_SO) + +include $(MAKEFILE_CONFIG) + +INC_DIR+=-I$(shell pwd)/../include +INC_DIR+=-I$(shell pwd)/../core/include +INC_DIR+=-I$(shell pwd)/../operator/include +INC_DIR+=-I$(shell pwd)/../executor/include + +CXXFLAGS+= + + +COMMON_CFLAGS+=$(CONFIG_OPT_CFLAGS) +COMMON_CFLAGS+= -Wall -g -fPIC $(INC_DIR) -Werror + +ifeq ($(CONFIG_INTERN_RELEASE),y) + COMMON_CFLAGS+=-DCONFIG_INTERN_RELEASE +endif + +ifeq ($(CONFIG_INTERN_TRIAL),y) + COMMON_CFLAGS+=-DCONFIG_INTERN_TRIAL +endif + +ifneq ($(CONFIG_OPT_CFLAGS),) + COMMON_CFLAGS+=-O3 -funroll-loops +endif + +ARM_BLOB=$(BUILD_DIR)/arm-builtin.o +LIB_HCL_SO?=$(BUILD_DIR)/../libhclcpu.so + + +LIB_SUB_DIRS+=../executor/operator lib/ + +ifeq ($(CONFIG_ARCH_ARM64),y) + COMMON_CFLAGS+= -DCONFIG_ARCH_ARM64=1 +endif + +ifeq ($(CONFIG_ARCH_BLAS),y) + COMMON_CFLAGS+=-DCONFIG_ARCH_BLAS=1 +endif + +ifeq ($(CONFIG_ARCH_ARM32),y) + COMMON_CFLAGS+= -DCONFIG_ARCH_ARM32=1 + CC+= -march=armv7-a -mfpu=neon + CXX+=-march=armv7-a -mfpu=neon +endif + + +ifeq ($(CONFIG_ACL_GPU),y) + ACL_LIBS+=-Wl,-rpath,$(ACL_ROOT)/build/ -L$(ACL_ROOT)/build + ACL_LIBS+= -larm_compute_core -larm_compute + LIB_LDFLAGS+=$(ACL_LIBS) +endif + +ARM_OBJS =$(addprefix $(BUILD_DIR)/, $(foreach 
f,$(LIB_SUB_DIRS),$(f)/built-in.o)) + +ifeq ($(CONFIG_ARCH_BLAS),y) + LIB_LDFLAGS+=-lopenblas +endif + +$(LIB_HCL_SO): $(ARM_BLOB) + $(CC) -o $@ -shared -Wl,-Bsymbolic -Wl,-Bsymbolic-functions $(ARM_BLOB) $(LIB_LDFLAGS) + +$(ARM_BLOB): $(ARM_OBJS) + $(BUILT_IN_LD) -r -o $@ $(ARM_OBJS) + + +$(ARM_OBJS): $(LIB_SUB_DIRS); + + + +build: default install + + +install: + @mkdir -p $(INSTALL_DIR)/lib + cp -f $(LIB_HCL_SO) $(INSTALL_DIR)/lib + +$(LIB_SUB_DIRS): + @$(MAKE) -C $@ -f $(MAKEBUILD) BUILD_DIR=$(BUILD_DIR)/$@ $(MAKECMDGOALS) + + +clean: $(LIB_SUB_DIRS) + @rm -rf $(ARM_BLOB) $(LIB_HCL_SO) + + +.PHONY: build clean default test install $(LIB_SUB_DIRS) + + + + + + diff --git a/hclarm/lib/Makefile b/hclarm/lib/Makefile new file mode 100644 index 000000000..6b1f91f43 --- /dev/null +++ b/hclarm/lib/Makefile @@ -0,0 +1 @@ +obj-y+=hcl_version.o diff --git a/hclarm/lib/hcl_version.c b/hclarm/lib/hcl_version.c new file mode 100644 index 000000000..46bb1f81e --- /dev/null +++ b/hclarm/lib/hcl_version.c @@ -0,0 +1,29 @@ +#include + +#define HCL_VERSION "1.2.2" + +const char * get_hcl_version(void) +{ + static char hcl_version[64]; + const char * postfix="github"; + +#ifdef CONFIG_INTERN_RELEASE + postfix="trial"; +#endif + +#ifdef CONFIG_INTERN_TRIAL + postfix="release"; +#endif + +#ifdef CONFIG_AUTHENICATION + postfix="authed"; +#endif + int ret=snprintf(hcl_version,64,"%s-%s", HCL_VERSION,postfix); + + if(ret>=64) + hcl_version[63]=0; + + return hcl_version; +} + + diff --git a/include/any.hpp b/include/any.hpp index 758fc4ceb..741c5b81a 100644 --- a/include/any.hpp +++ b/include/any.hpp @@ -20,23 +20,21 @@ #include #include #include +#include - -namespace TEngine -{ +namespace TEngine { static inline std::string GetRealName(const char* name) { - std::string result; + std::string result; + + char* real_name = abi::__cxa_demangle(name, nullptr, nullptr, nullptr); - char * real_name=abi::__cxa_demangle(name, nullptr, - nullptr, nullptr); + result = real_name; - 
result=real_name; + std::free(real_name); - std::free(real_name); - - return result; + return result; } class bad_any_cast : public std::bad_cast @@ -44,31 +42,33 @@ class bad_any_cast : public std::bad_cast public: bad_any_cast(const std::type_info& expected, const std::type_info& real) { - std::string& message=GetMessage(); + std::string& message = GetMessage(); - message=std::string("Bad any cast: Expected: ")+GetRealName(real.name()); - message+=" Real: "+GetRealName(expected.name()); + message = std::string("Bad any cast: Expected: ") + GetRealName(real.name()); + message += " Real: " + GetRealName(expected.name()); - const char * str=getenv("HALT_ON_MISMATCH"); + const char* str = getenv("HALT_ON_MISMATCH"); - if(str) - { - std::cerr<clear(); } - /// Constructs an object of type any that contains an object of type T direct-initialized with std::forward(value). + /// Constructs an object of type any that contains an object of type T direct-initialized with + /// std::forward(value). /// /// T shall satisfy the CopyConstructible requirements, otherwise the program is ill-formed. - /// This is because an `any` may be copy constructed into another `any` at any time, so a copy should always be allowed. - template::type, any>::value>::type> + /// This is because an `any` may be copy constructed into another `any` at any time, so a copy should always be + /// allowed. + template ::type, any>::value>::type> any(ValueType&& value) { static_assert(std::is_copy_constructible::type>::value, - "T shall satisfy the CopyConstructible requirements."); + "T shall satisfy the CopyConstructible requirements."); this->construct(std::forward(value)); } @@ -141,12 +139,14 @@ class any final /// Has the same effect as any(std::forward(value)).swap(*this). No effect if a exception is thrown. /// /// T shall satisfy the CopyConstructible requirements, otherwise the program is ill-formed. 
- /// This is because an `any` may be copy constructed into another `any` at any time, so a copy should always be allowed. - template::type, any>::value>::type> + /// This is because an `any` may be copy constructed into another `any` at any time, so a copy should always be + /// allowed. + template ::type, any>::value>::type> any& operator=(ValueType&& value) { static_assert(std::is_copy_constructible::type>::value, - "T shall satisfy the CopyConstructible requirements."); + "T shall satisfy the CopyConstructible requirements."); any(std::forward(value)).swap(*this); return *this; } @@ -170,7 +170,7 @@ class any final /// If *this has a contained object of type T, typeid(T); otherwise typeid(void). const std::type_info& type() const noexcept { - return empty()? typeid(void) : this->vtable->type(); + return empty() ? typeid(void) : this->vtable->type(); } /// Exchange the states of *this and rhs. @@ -185,7 +185,7 @@ class any final if(this->vtable != nullptr) { this->vtable->move(this->storage, rhs.storage); - //this->vtable = nullptr; -- uneeded, see below + // this->vtable = nullptr; -- uneeded, see below } // move from tmp (previously rhs) to *this. @@ -196,84 +196,81 @@ class any final tmp.vtable = nullptr; } } - else // same types + else // same types { if(this->vtable != nullptr) this->vtable->swap(this->storage, rhs.storage); } } -private: // Storage and Virtual Method Table - +private: // Storage and Virtual Method Table union storage_union { using stack_storage_t = typename std::aligned_storage<2 * sizeof(void*), std::alignment_of::value>::type; - void* dynamic; - stack_storage_t stack; // 2 words for e.g. shared_ptr + void* dynamic; + stack_storage_t stack; // 2 words for e.g. shared_ptr }; - - /// Base VTable specification. - struct vtable_type + + /// Base VTable specification. + struct vtable_type + { + // Note: The caller is responssible for doing .vtable = nullptr after destructful operations + // such as destroy() and/or move(). 
+ + /// The type of the object this vtable is for. + const std::type_info& (*type)(); + + /// Destroys the object in the union. + /// The state of the union after this call is unspecified, caller must ensure not to use src anymore. + void (*destroy)(storage_union&); + + /// Copies the **inner** content of the src union into the yet unitialized dest union. + /// As such, both inner objects will have the same state, but on separate memory locations. + void (*copy)(const storage_union& src, storage_union& dest); + + /// Moves the storage from src to the yet unitialized dest union. + /// The state of src after this call is unspecified, caller must ensure not to use src anymore. + void (*move)(storage_union& src, storage_union& dest); + + /// Exchanges the storage between lhs and rhs. + void (*swap)(storage_union& lhs, storage_union& rhs); + }; + + /// VTable for dynamically allocated storage. + template struct vtable_dynamic + { + static const std::type_info& type() noexcept { - // Note: The caller is responssible for doing .vtable = nullptr after destructful operations - // such as destroy() and/or move(). - - /// The type of the object this vtable is for. - const std::type_info& (*type)(); - - /// Destroys the object in the union. - /// The state of the union after this call is unspecified, caller must ensure not to use src anymore. - void(*destroy)(storage_union&); - - /// Copies the **inner** content of the src union into the yet unitialized dest union. - /// As such, both inner objects will have the same state, but on separate memory locations. - void(*copy)(const storage_union& src, storage_union& dest); - - /// Moves the storage from src to the yet unitialized dest union. - /// The state of src after this call is unspecified, caller must ensure not to use src anymore. - void(*move)(storage_union& src, storage_union& dest); - - /// Exchanges the storage between lhs and rhs. 
- void(*swap)(storage_union& lhs, storage_union& rhs); - }; - - /// VTable for dynamically allocated storage. - template - struct vtable_dynamic + return typeid(T); + } + + static void destroy(storage_union& storage) noexcept { - static const std::type_info& type() noexcept - { - return typeid(T); - } - - static void destroy(storage_union& storage) noexcept - { - //assert(reinterpret_cast(storage.dynamic)); - delete reinterpret_cast(storage.dynamic); - } - - static void copy(const storage_union& src, storage_union& dest) - { - dest.dynamic = new T(*reinterpret_cast(src.dynamic)); - } - - static void move(storage_union& src, storage_union& dest) noexcept - { - dest.dynamic = src.dynamic; - src.dynamic = nullptr; - } - - static void swap(storage_union& lhs, storage_union& rhs) noexcept - { - // just exchage the storage pointers. - std::swap(lhs.dynamic, rhs.dynamic); - } - }; + // assert(reinterpret_cast(storage.dynamic)); + delete reinterpret_cast(storage.dynamic); + } + + static void copy(const storage_union& src, storage_union& dest) + { + dest.dynamic = new T(*reinterpret_cast(src.dynamic)); + } + + static void move(storage_union& src, storage_union& dest) noexcept + { + dest.dynamic = src.dynamic; + src.dynamic = nullptr; + } + + static void swap(storage_union& lhs, storage_union& rhs) noexcept + { + // just exchage the storage pointers. + std::swap(lhs.dynamic, rhs.dynamic); + } + }; /// VTable for stack allocated storage. - template - struct vtable_stack + template struct vtable_stack { static const std::type_info& type() noexcept { @@ -287,14 +284,14 @@ class any final static void copy(const storage_union& src, storage_union& dest) { - new (&dest.stack) T(reinterpret_cast(src.stack)); + new(&dest.stack) T(reinterpret_cast(src.stack)); } static void move(storage_union& src, storage_union& dest) noexcept { // one of the conditions for using vtable_stack is a nothrow move constructor, // so this move constructor will never throw a exception. 
- new (&dest.stack) T(std::move(reinterpret_cast(src.stack))); + new(&dest.stack) T(std::move(reinterpret_cast(src.stack))); destroy(src); } @@ -305,32 +302,30 @@ class any final }; /// Whether the type T must be dynamically allocated or can be stored on the stack. - template - struct requires_allocation : - std::integral_constant::value // N4562 §6.3/3 [any.class] - && sizeof(T) <= sizeof(storage_union::stack) - && std::alignment_of::value <= std::alignment_of::value)> - {}; + template + struct requires_allocation + : std::integral_constant::value // N4562 §6.3/3 [any.class] + && sizeof(T) <= sizeof(storage_union::stack) && + std::alignment_of::value <= + std::alignment_of::value)> + { + }; /// Returns the pointer to the vtable of the type T. - template - static vtable_type* vtable_for_type() + template static vtable_type* vtable_for_type() { - using VTableType = typename std::conditional::value, vtable_dynamic, vtable_stack>::type; + using VTableType = + typename std::conditional::value, vtable_dynamic, vtable_stack>::type; static vtable_type table = { - VTableType::type, VTableType::destroy, - VTableType::copy, VTableType::move, - VTableType::swap, + VTableType::type, VTableType::destroy, VTableType::copy, VTableType::move, VTableType::swap, }; return &table; } protected: - template - friend const T* any_cast(const any* operand) noexcept; - template - friend T* any_cast(any* operand) noexcept; + template friend const T* any_cast(const any* operand) noexcept; + template friend T* any_cast(any* operand) noexcept; /// Same effect as is_same(this->type(), t); bool is_typed(const std::type_info& t) const @@ -347,9 +342,13 @@ class any final static bool is_same(const std::type_info& a, const std::type_info& b) { #ifdef ANY_IMPL_FAST_TYPE_INFO_COMPARE - return &a == &b; + return &a == &b; +#else +#ifdef __ANDROID__ + return a == b || strcmp(a.name(),b.name()) == 0; #else return a == b; +#endif #endif } @@ -480,7 +479,13 @@ class any final inline T* any_cast(any* 
operand) noexcept { if(operand == nullptr || !operand->is_typed(typeid(T))) + { + if(operand != nullptr ) + { + std::cout << "type is not same-----------------------\n"; + } return nullptr; + } else return operand->cast(); } @@ -494,4 +499,3 @@ namespace std lhs.swap(rhs); } } - diff --git a/include/share_lib_parser.hpp b/include/share_lib_parser.hpp index d7903d6ec..881ff0ce8 100644 --- a/include/share_lib_parser.hpp +++ b/include/share_lib_parser.hpp @@ -54,7 +54,7 @@ class ShareLibParser sl = ::dlopen(so_path.c_str(), RTLD_LAZY | RTLD_GLOBAL); if(!sl) { - std::printf("%s\n", dlerror()); + //std::printf("%s\n", dlerror()); throw te_error_unable_to_load_library(so_path); return -1; } @@ -70,7 +70,7 @@ class ShareLibParser if(!f) { throw te_error_shared_function_not_found(func_name); - return nullptr; + //return nullptr; } func_map.emplace(func_name, ( func* )f); it = func_map.find(func_name); diff --git a/include/te_error.hpp b/include/te_error.hpp index 9ab8ee9ce..44101e377 100644 --- a/include/te_error.hpp +++ b/include/te_error.hpp @@ -34,22 +34,40 @@ struct te_error_base : public std::runtime_error { return error_code; } - te_error_base(error_code_t e) : runtime_error("tengine error"), error_code(e) {} + te_error_base() : runtime_error("tengine error"){} }; struct te_error_shared_function_not_found : public te_error_base { using te_error_base::te_error_base; + static std::string msg; + + te_error_shared_function_not_found(const std::string& func_name) + { + msg="\nShared function not found: "; + msg+=func_name; + msg+="\n"; + } + const char* what() const throw() override { - return "Shared function not found"; + return msg.c_str(); } }; struct te_error_unable_to_load_library : public te_error_base { using te_error_base::te_error_base; + static std::string msg; + + te_error_unable_to_load_library(const std::string& so_name) + { + msg="\nShared library not found: "; + msg+=so_name; + msg+="\n"; + } + const char* what() const throw() override { - return "Unable 
to load library"; + return msg.c_str(); } }; struct te_error_general : public te_error_base diff --git a/include/type_name.hpp b/include/type_name.hpp index de0560ba6..f3e49ef8e 100644 --- a/include/type_name.hpp +++ b/include/type_name.hpp @@ -37,7 +37,7 @@ template static std::string type_name() { typedef typename std::remove_reference::type TR; std::unique_ptr own( -#ifndef __GNUC__ +#if !defined(__GNUC__) || defined(NO_CXA_DEMANGLE) nullptr, #else abi::__cxa_demangle(typeid(TR).name(), nullptr, nullptr, nullptr), @@ -72,7 +72,7 @@ template static std::string GetNameForType(T&& t) static std::string GetTypeName(const char* name) { -#ifndef __GNUC__ +#if !defined(__GNUC__) || defined(NO_CXA_DEMANGLE) return name; #else std::unique_ptr own(abi::__cxa_demangle(name, nullptr, nullptr, nullptr), std::free); diff --git a/makefile.config.example b/makefile.config.example index cbac07ee3..bb6c32aae 100644 --- a/makefile.config.example +++ b/makefile.config.example @@ -15,8 +15,19 @@ # $ make -j8 #------------------------------------------------------------------------------- +# cross compile for ARM64 +# CROSS_COMPILE=aarch64-linux-gnu- +# cross compile for ARM32 +# CROSS_COMPILE=arm-linux-gnueabihf- + +# Just to differentiate with sysroot for embedded toolchains building +# As toolchains just need a few pre-built libraries +# +# EMBEDDED_CROSS_ROOT=/opt/install/ + # Set the target arch -CONFIG_ARCH_ARM64=y + CONFIG_ARCH_ARM64=y +# CONFIG_ARCH_ARM32=y # Enable Compiling Optimization CONFIG_OPT_CFLAGS = -O2 @@ -35,6 +46,7 @@ CONFIG_CAFFE_SERIALIZER=y # CONFIG_MXNET_SERIALIZER=y # CONFIG_ONNX_SERIALIZER=y # CONFIG_TF_SERIALIZER=y +# CONFIG_TFLITE_SERIALIZER=y CONFIG_TENGINE_SERIALIZER=y # Enable Wrappers @@ -46,3 +58,5 @@ CONFIG_VERSION_POSTFIX=github # support legacy API CONFIG_LEGACY_API=y +# kernel configuration +CONFIG_KERNEL_FP32=y diff --git a/operator/include/operator/add_n.hpp b/operator/include/operator/add_n.hpp new file mode 100644 index 000000000..ae528d306 
--- /dev/null +++ b/operator/include/operator/add_n.hpp @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ +#ifndef __ADDN_HPP__ +#define __ADDN_HPP__ + +#include "operator.hpp" +#include "addn_param.hpp" + +namespace TEngine { + +class Addn : public OperatorWithParam +{ +public: + Addn() + { + name_ = "Addn"; + } + Addn(const Addn& src) = default; + + void SetSchema(void) override; + + bool InferShape(const std::vector&, std::vector&, int layout) override; +}; +} // namespace TEngine + +#endif diff --git a/operator/include/operator/addn_param.hpp b/operator/include/operator/addn_param.hpp new file mode 100644 index 000000000..41cb560fd --- /dev/null +++ b/operator/include/operator/addn_param.hpp @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ +#ifndef __ADDN_PARAM_HPP__ +#define __ADDN_PARAM_HPP__ + +#include "parameter.hpp" + +namespace TEngine { + +struct AddnParam : public NamedParam +{ + int axis; + DECLARE_PARSER_STRUCTURE(AddnParam) + { + DECLARE_PARSER_ENTRY(axis); + } +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/conv_param.hpp b/operator/include/operator/conv_param.hpp index 565c53877..064b623a2 100644 --- a/operator/include/operator/conv_param.hpp +++ b/operator/include/operator/conv_param.hpp @@ -46,14 +46,16 @@ struct ConvParam : public NamedParam int kernel_w; int stride_h; int stride_w; - int pad_h; - int pad_w; int dilation_h; int dilation_w; + int input_channel; int output_channel; int group; int activation; - std::vector pads; + int pad_h0; // top padding rows + int pad_w0; // left padding columns + int pad_h1; // bottom padding rows + int pad_w1; // right padding columns DECLARE_PARSER_STRUCTURE(ConvParam) { @@ -61,13 +63,16 @@ struct ConvParam : public NamedParam DECLARE_PARSER_ENTRY(kernel_w); DECLARE_PARSER_ENTRY(stride_h); DECLARE_PARSER_ENTRY(stride_w); - DECLARE_PARSER_ENTRY(pad_h); - DECLARE_PARSER_ENTRY(pad_w); DECLARE_PARSER_ENTRY(dilation_h); DECLARE_PARSER_ENTRY(dilation_w); + DECLARE_PARSER_ENTRY(input_channel); DECLARE_PARSER_ENTRY(output_channel); DECLARE_PARSER_ENTRY(group); DECLARE_PARSER_ENTRY(activation); + DECLARE_PARSER_ENTRY(pad_h0); + DECLARE_PARSER_ENTRY(pad_w0); + DECLARE_PARSER_ENTRY(pad_h1); + DECLARE_PARSER_ENTRY(pad_w1); }; }; 
diff --git a/operator/include/operator/deconv_param.hpp b/operator/include/operator/deconv_param.hpp index f8bad7170..f3bb0c93e 100644 --- a/operator/include/operator/deconv_param.hpp +++ b/operator/include/operator/deconv_param.hpp @@ -30,19 +30,36 @@ namespace TEngine { struct DeconvParam : public NamedParam { - int kernel_size; - int stride; - int pad; int num_output; - int dilation; + int kernel_h; + int kernel_w; + int stride_h; + int stride_w; + int pad_h0; + int pad_w0; + int pad_h1; + int pad_w1; + int dilation_h; + int dilation_w; + int group; + int activation; DECLARE_PARSER_STRUCTURE(DeconvParam) { - DECLARE_PARSER_ENTRY(kernel_size); - DECLARE_PARSER_ENTRY(stride); - DECLARE_PARSER_ENTRY(pad); - DECLARE_PARSER_ENTRY(num_output); - DECLARE_PARSER_ENTRY(dilation); + DECLARE_PARSER_ENTRY(num_output); + DECLARE_PARSER_ENTRY(kernel_h); + DECLARE_PARSER_ENTRY(kernel_w); + DECLARE_PARSER_ENTRY(stride_h); + DECLARE_PARSER_ENTRY(stride_w); + DECLARE_PARSER_ENTRY(pad_h0); + DECLARE_PARSER_ENTRY(pad_w0); + DECLARE_PARSER_ENTRY(pad_h1); + DECLARE_PARSER_ENTRY(pad_w1); + DECLARE_PARSER_ENTRY(dilation_h); + DECLARE_PARSER_ENTRY(dilation_w); + DECLARE_PARSER_ENTRY(group); + DECLARE_PARSER_ENTRY(activation); + }; }; diff --git a/operator/include/operator/demo_op.hpp b/operator/include/operator/demo_op.hpp index d57736818..c8ed423f4 100644 --- a/operator/include/operator/demo_op.hpp +++ b/operator/include/operator/demo_op.hpp @@ -37,6 +37,8 @@ class DemoOp : public OperatorNoParam } DemoOp(const DemoOp& src) = default; + bool InferShape(const std::vector& ishape, std::vector& oshape, int layout) override; + void SetSchema(void) override; }; } // namespace TEngine diff --git a/operator/include/operator/eltwise.hpp b/operator/include/operator/eltwise.hpp index ccc677ce2..34b329fa1 100644 --- a/operator/include/operator/eltwise.hpp +++ b/operator/include/operator/eltwise.hpp @@ -38,23 +38,23 @@ class Eltwise : public OperatorWithParam Eltwise(const Eltwise& src) = default; 
virtual ~Eltwise(){}; - void MethodToType(EltwiseParam& param) - { - std::string& method = param.method; + // void MethodToType(EltwiseParam& param) + // { + // std::string& method = param.method; - /* default eltwise_SUM */ - param.type = ELT_SUM; + // /* default eltwise_SUM */ + // param.type = ELT_SUM; - if(method == "max") - param.type = ELT_MAX; - else if(method == "prod") - param.type = ELT_PROD; - } - void ParseParam(EltwiseParam& param, Operator* op) override - { - ParsePredefinedParam(param, op); - MethodToType(param); - } + // if(method == "max") + // param.type = ELT_MAX; + // else if(method == "prod") + // param.type = ELT_PROD; + // } + // void ParseParam(EltwiseParam& param, Operator* op) override + // { + // ParsePredefinedParam(param, op); + // MethodToType(param); + // } void SetSchema(void) override; bool InferShape(const std::vector& ishape, std::vector& oshape, int layout) override; diff --git a/operator/include/operator/eltwise_param.hpp b/operator/include/operator/eltwise_param.hpp index 740677a44..c63c1b428 100644 --- a/operator/include/operator/eltwise_param.hpp +++ b/operator/include/operator/eltwise_param.hpp @@ -37,20 +37,28 @@ enum EltType ELT_MAX, ELT_RSQRT, ELT_MIN_SCALAR, - ELT_LAST + ELT_LAST, + ELT_DIV, + ELT_LOG, + ELT_EXP, + ELT_SQRT, + ELT_FLOOR, + ELT_SQUARE, + ELT_POW }; namespace TEngine { struct EltwiseParam : public NamedParam { - std::string method; - EltType type; + // std::string method; + // EltType type; + int type; int caffe_flavor; DECLARE_PARSER_STRUCTURE(EltwiseParam) { - DECLARE_PARSER_ENTRY(method); + DECLARE_PARSER_ENTRY(type); DECLARE_PARSER_ENTRY(caffe_flavor); }; }; diff --git a/operator/include/operator/gru.hpp b/operator/include/operator/gru.hpp new file mode 100644 index 000000000..6a13fdd9e --- /dev/null +++ b/operator/include/operator/gru.hpp @@ -0,0 +1,63 @@ +#ifndef __GRU_HPP__ +#define __GRU_HPP__ + +#include "operator.hpp" +#include "gru_param.hpp" + +namespace TEngine { + +class GRU : public 
OperatorWithParam +{ +public: + GRU(void) + { + name_ = "GRU"; + } + GRU(const GRU&) = default; + void SetSchema(void) override; + bool InferShape(const std::vector&, std::vector&, int layout) override; + const char* GetBiasName(void) + { + return "gates/bias"; + } + const char* GetKernelName(void) + { + return "gates/kernel"; + } + const char* GetInitHiddenName(void) + { + return "init_h"; + } + const char* GetCandidateKernelName(void) + { + return "candidate/kernel"; + } + const char* GetCandidateBiasName(void) + { + return "candidate/bias"; + } + const char* Geti2hweightName(void) + { + return "i2h_weight"; + } + const char* Geti2hbiasName(void) + { + return "i2h_bias"; + } + const char* Geth2hweightName(void) + { + return "h2h_weight"; + } + const char* Geth2hbiasName(void) + { + return "h2h_bias"; + } + const char* GetFusedKernelName(void) + { + return "parameters"; + } +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/gru_param.hpp b/operator/include/operator/gru_param.hpp new file mode 100644 index 000000000..514c3bab0 --- /dev/null +++ b/operator/include/operator/gru_param.hpp @@ -0,0 +1,58 @@ +/* + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __GRU_PARAM_HPP__ +#define __GRU_PARAM_HPP__ + +#include + +#include "parameter.hpp" + +namespace TEngine { + +#define GRU_ACT_TANH 1 + +struct GRUParam : public NamedParam +{ + float clip; + int output_len; + int sequence_len; + int input_size; + int hidden_size; + int has_clip; + int has_gate_bias; + int has_candidate_bias; + int has_init_state; + int mxnet_flag; + + DECLARE_PARSER_STRUCTURE(GRUParam) + { + DECLARE_PARSER_ENTRY(clip); + DECLARE_PARSER_ENTRY(output_len); + DECLARE_PARSER_ENTRY(sequence_len); + DECLARE_PARSER_ENTRY(input_size); + DECLARE_PARSER_ENTRY(hidden_size); + DECLARE_PARSER_ENTRY(has_clip); + DECLARE_PARSER_ENTRY(has_gate_bias); + DECLARE_PARSER_ENTRY(has_candidate_bias); + DECLARE_PARSER_ENTRY(has_init_state); + DECLARE_PARSER_ENTRY(mxnet_flag); + }; +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/lstm.hpp b/operator/include/operator/lstm.hpp index 62c21ed59..8d4dcf13b 100644 --- a/operator/include/operator/lstm.hpp +++ b/operator/include/operator/lstm.hpp @@ -16,6 +16,10 @@ class LSTM : public OperatorWithParam LSTM(const LSTM&) = default; void SetSchema(void) override; bool InferShape(const std::vector&, std::vector&, int layout) override; + const char* GetKernelName(void) + { + return "kernel"; + } const char* GetBiasName(void) { return "bias"; @@ -44,6 +48,27 @@ class LSTM : public OperatorWithParam { return "init_h"; } + const char* Geti2hKernelName(void) + { + return "i2h_weight"; + } + const char* Geti2hBiasName(void) + { + return "i2h_bias"; + } + const char* Geth2hKernelName(void) + { + return "h2h_weight"; + } + const char* Geth2hBiasName(void) + { + return "h2h_bias"; + } + const char* GetFusedKernelName(void) + { + return "parameters"; + } + }; } // namespace TEngine diff --git a/operator/include/operator/lstm_param.hpp b/operator/include/operator/lstm_param.hpp index 6e0551902..5c01cbf02 100644 --- 
a/operator/include/operator/lstm_param.hpp +++ b/operator/include/operator/lstm_param.hpp @@ -30,6 +30,9 @@ namespace TEngine { +#define LSTM_ACT_SIGMOID 1 +#define LSTM_ACT_TANH 2 + struct LSTMParam : public NamedParam { float forget_bias; @@ -44,11 +47,12 @@ struct LSTMParam : public NamedParam int has_clip; int has_bias; int has_init_state; - const char* forget_act; - const char* input_act; - const char* output_act; - const char* cellin_act; - const char* cellout_act; + int forget_act; + int input_act; + int output_act; + int cellin_act; + int cellout_act; + int mxnet_flag; DECLARE_PARSER_STRUCTURE(LSTMParam) { @@ -69,6 +73,7 @@ struct LSTMParam : public NamedParam DECLARE_PARSER_ENTRY(cellin_act); DECLARE_PARSER_ENTRY(output_act); DECLARE_PARSER_ENTRY(cellout_act); + DECLARE_PARSER_ENTRY(mxnet_flag); }; }; diff --git a/operator/include/operator/pad.hpp b/operator/include/operator/pad.hpp new file mode 100644 index 000000000..d11183ce2 --- /dev/null +++ b/operator/include/operator/pad.hpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __PAD_HPP__ +#define __PAD_HPP__ + +#include "operator.hpp" +#include "pad_param.hpp" + +namespace TEngine { + +class Pad : public OperatorWithParam +{ +public: + Pad() + { + name_ = "Pad"; + } + Pad(const Pad& src) = default; + + virtual ~Pad() {} + bool InferShape(const std::vector& ishape, std::vector& oshape, + int layout) override; + void SetSchema(void) override; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/pad_param.hpp b/operator/include/operator/pad_param.hpp new file mode 100644 index 000000000..82eb30fb9 --- /dev/null +++ b/operator/include/operator/pad_param.hpp @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __PAD_PARAM_HPP__ +#define __PAD_PARAM_HPP__ + +#include "parameter.hpp" + +namespace TEngine { + +struct PadParam : public NamedParam +{ + //mode : 0: CONSTANT; 1: REFLECT; 2: SYMMETRIC. 
+ int mode; + int pad_0_h; + int pad_0_w; + int pad_1_h; + int pad_1_w; + int pad_2_h; + int pad_2_w; + int pad_3_h; + int pad_3_w; + float value; + + DECLARE_PARSER_STRUCTURE(PadParam) + { + DECLARE_PARSER_ENTRY(mode); + DECLARE_PARSER_ENTRY(pad_0_h); + DECLARE_PARSER_ENTRY(pad_0_w); + DECLARE_PARSER_ENTRY(pad_1_h); + DECLARE_PARSER_ENTRY(pad_1_w); + DECLARE_PARSER_ENTRY(pad_2_h); + DECLARE_PARSER_ENTRY(pad_2_w); + DECLARE_PARSER_ENTRY(pad_3_h); + DECLARE_PARSER_ENTRY(pad_3_w); + DECLARE_PARSER_ENTRY(value); + }; +}; +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/pool_param.hpp b/operator/include/operator/pool_param.hpp index 7254b882e..7f1ea4e39 100644 --- a/operator/include/operator/pool_param.hpp +++ b/operator/include/operator/pool_param.hpp @@ -48,15 +48,14 @@ struct PoolParam : public NamedParam int alg; int kernel_h; int kernel_w; - int pad_h; - int pad_w; int stride_h; int stride_w; int global; int caffe_flavor; - std::vector kernel_shape; ///> The size of the kernel along each axis (H, W). - std::vector strides; ///> stride along each axis (H, W). - std::vector pads; ///> [x1_begin, x2_begin...x1_end, x2_end,...] for each axis. 
+ int pad_h0; // top padding rows + int pad_w0; // left padding columns + int pad_h1; // bottom padding rows + int pad_w1; // right padding columns DECLARE_PARSER_STRUCTURE(PoolParam) { @@ -65,10 +64,12 @@ struct PoolParam : public NamedParam DECLARE_PARSER_ENTRY(kernel_w); DECLARE_PARSER_ENTRY(stride_h); DECLARE_PARSER_ENTRY(stride_w); - DECLARE_PARSER_ENTRY(pad_h); - DECLARE_PARSER_ENTRY(pad_w); DECLARE_PARSER_ENTRY(global); DECLARE_PARSER_ENTRY(caffe_flavor); + DECLARE_PARSER_ENTRY(pad_h0); + DECLARE_PARSER_ENTRY(pad_w0); + DECLARE_PARSER_ENTRY(pad_h1); + DECLARE_PARSER_ENTRY(pad_w1); }; }; diff --git a/operator/include/operator/pooling.hpp b/operator/include/operator/pooling.hpp index cd62f9176..34e446ccf 100644 --- a/operator/include/operator/pooling.hpp +++ b/operator/include/operator/pooling.hpp @@ -47,26 +47,6 @@ class Pooling : public OperatorWithParam void SetSchema(void) override; - void ParseParam(PoolParam& param, Operator* op) override - { - ParsePredefinedParam(param, op); - - /* translate to onnx parameters */ - param.kernel_shape.resize(2); - - param.kernel_shape[0] = param.kernel_h; - param.kernel_shape[1] = param.kernel_w; - - param.strides.resize(2); - param.strides[0] = param.stride_h; - param.strides[1] = param.stride_w; - - param.pads.resize(4); - param.pads[0] = param.pad_h; - param.pads[1] = param.pad_w; - param.pads[2] = param.pad_h; - param.pads[3] = param.pad_w; - } }; } // namespace TEngine diff --git a/operator/include/operator/reduction.hpp b/operator/include/operator/reduction.hpp new file mode 100644 index 000000000..65b887dac --- /dev/null +++ b/operator/include/operator/reduction.hpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __REDUCTION_HPP__ +#define __REDUCTION_HPP__ + +#include "operator.hpp" +#include "reduction_param.hpp" + +namespace TEngine { + +class Reduction : public OperatorWithParam +{ +public: + Reduction() + { + name_ = "Reduction"; + } + Reduction(const Reduction& src) = default; + + virtual ~Reduction() {} + bool InferShape(const std::vector& ishape, std::vector& oshape, + int layout) override; + void SetSchema(void) override; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/reduction_param.hpp b/operator/include/operator/reduction_param.hpp new file mode 100644 index 000000000..49c1c4ed7 --- /dev/null +++ b/operator/include/operator/reduction_param.hpp @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __REDUCTION_PARAM_HPP__ +#define __REDUCTION_PARAM_HPP__ + +#include "parameter.hpp" + +namespace TEngine { + +struct ReductionParam : public NamedParam +{ + int dim_0; + int dim_1; + int dim_2; + int dim_3; + //type : 0: sum; 1: mean. + int type; + int keepdim; + DECLARE_PARSER_STRUCTURE(ReductionParam) + { + DECLARE_PARSER_ENTRY(dim_0); + DECLARE_PARSER_ENTRY(dim_1); + DECLARE_PARSER_ENTRY(dim_2); + DECLARE_PARSER_ENTRY(dim_3); + DECLARE_PARSER_ENTRY(keepdim); + DECLARE_PARSER_ENTRY(type); + }; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/rnn.hpp b/operator/include/operator/rnn.hpp new file mode 100644 index 000000000..8811bf4e2 --- /dev/null +++ b/operator/include/operator/rnn.hpp @@ -0,0 +1,31 @@ +#ifndef __RNN_HPP__ +#define __RNN_HPP__ + +#include "operator.hpp" +#include "rnn_param.hpp" + +namespace TEngine { + +class RNN : public OperatorWithParam +{ +public: + RNN(void) + { + name_ = "RNN"; + } + RNN(const RNN&) = default; + void SetSchema(void) override; + bool InferShape(const std::vector&, std::vector&, int layout) override; + const char* GetBiasName(void) + { + return "bias"; + } + const char* GetInitHiddenName(void) + { + return "init_h"; + } +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/rnn_param.hpp b/operator/include/operator/rnn_param.hpp new file mode 100644 index 000000000..cc79455c4 --- /dev/null +++ 
b/operator/include/operator/rnn_param.hpp @@ -0,0 +1,56 @@ +/* + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __RNN_PARAM_HPP__ +#define __RNN_PARAM_HPP__ + +#include + +#include "parameter.hpp" + +namespace TEngine { + +#define RNN_ACT_TANH 1 + +struct RNNParam : public NamedParam +{ + float clip; + int output_len; + int sequence_len; + int input_size; + int hidden_size; + int has_clip; + int has_bias; + int has_init_state; + int activation; + + DECLARE_PARSER_STRUCTURE(RNNParam) + { + DECLARE_PARSER_ENTRY(clip); + DECLARE_PARSER_ENTRY(output_len); + DECLARE_PARSER_ENTRY(sequence_len); + DECLARE_PARSER_ENTRY(input_size); + DECLARE_PARSER_ENTRY(hidden_size); + DECLARE_PARSER_ENTRY(has_clip); + DECLARE_PARSER_ENTRY(has_bias); + DECLARE_PARSER_ENTRY(has_init_state); + DECLARE_PARSER_ENTRY(activation); + }; +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/sigmoid.hpp b/operator/include/operator/sigmoid.hpp new file mode 100644 index 000000000..c1a9ce5d7 --- /dev/null +++ b/operator/include/operator/sigmoid.hpp @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ +#ifndef __SIGMOID_HPP__ +#define __SIGMOID_HPP__ + +#include "operator.hpp" + +namespace TEngine { + +class Sigmoid : public OperatorNoParam +{ +public: + Sigmoid() + { + name_ = "Sigmoid"; + } + Sigmoid(const Sigmoid& src) = default; + virtual ~Sigmoid(){}; + + float GetFops(const std::vector& inputs, const std::vector& outputs) override; + + void SetSchema(void) override; +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/slice_param.hpp b/operator/include/operator/slice_param.hpp index f19744a7a..32c4d5a27 100644 --- a/operator/include/operator/slice_param.hpp +++ b/operator/include/operator/slice_param.hpp @@ -31,6 +31,10 @@ namespace TEngine { struct SliceParam : public NamedParam { int axis; + std::vector slice_point_; + std::vector begin_; + std::vector size_; + bool iscaffe; DECLARE_PARSER_STRUCTURE(SliceParam) { diff --git a/operator/include/operator/split.hpp b/operator/include/operator/split.hpp index e1712c5ff..fa6881127 100644 --- a/operator/include/operator/split.hpp +++ b/operator/include/operator/split.hpp @@ -25,10 +25,11 @@ #define __SPLIT_HPP__ #include "operator.hpp" +#include "split_param.hpp" namespace TEngine { -class Split : public OperatorNoParam +class Split : public OperatorWithParam { public: Split() @@ -36,7 +37,7 @@ class Split : public OperatorNoParam name_ = "Split"; } Split(const Split& src) = default; - ~Split() {} + virtual ~Split() {} bool InferShape(const std::vector& ishape, std::vector& oshape, int 
layout) override; diff --git a/operator/include/operator/split_param.hpp b/operator/include/operator/split_param.hpp new file mode 100644 index 000000000..6e5627b6a --- /dev/null +++ b/operator/include/operator/split_param.hpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __SPLIT_PARAM_HPP__ +#define __SPLIT_PARAM_HPP__ + +#include "parameter.hpp" + +namespace TEngine { + +struct SplitParam : public NamedParam +{ + int axis; + int split_dim; + bool is_caffe; + std::vector split_sizes_; + + DECLARE_PARSER_STRUCTURE(SplitParam) + { + DECLARE_PARSER_ENTRY(axis); + DECLARE_PARSER_ENTRY(split_dim); + DECLARE_PARSER_ENTRY(is_caffe); + DECLARE_PARSER_ENTRY(split_sizes_); + } +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/squeeze.hpp b/operator/include/operator/squeeze.hpp new file mode 100644 index 000000000..7113861ad --- /dev/null +++ b/operator/include/operator/squeeze.hpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __SQUEEZE_HPP__ +#define __SQUEEZE_HPP__ + +#include "operator.hpp" +#include "squeeze_param.hpp" + +namespace TEngine { + +class Squeeze : public OperatorWithParam +{ +public: + Squeeze() + { + name_ = "Squeeze"; + } + Squeeze(const Squeeze& src) = default; + + virtual ~Squeeze() {} + bool InferShape(const std::vector& ishape, std::vector& oshape, + int layout) override; + void SetSchema(void) override; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/squeeze_param.hpp b/operator/include/operator/squeeze_param.hpp new file mode 100644 index 000000000..9377f0078 --- /dev/null +++ b/operator/include/operator/squeeze_param.hpp @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __SQUEEZE_PARAM_HPP__ +#define __SQUEEZE_PARAM_HPP__ + +#include "parameter.hpp" + +namespace TEngine { + +struct SqueezeParam : public NamedParam +{ + int dim_0; + int dim_1; + int dim_2; + int dim_3; + DECLARE_PARSER_STRUCTURE(SqueezeParam) + { + DECLARE_PARSER_ENTRY(dim_0); + DECLARE_PARSER_ENTRY(dim_1); + DECLARE_PARSER_ENTRY(dim_2); + DECLARE_PARSER_ENTRY(dim_3); + }; +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/swap_axis.hpp b/operator/include/operator/swap_axis.hpp new file mode 100644 index 000000000..9ae5f8cf9 --- /dev/null +++ b/operator/include/operator/swap_axis.hpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ +#ifndef __SWAP_AXIS_HPP__ +#define __SWAP_AXIS_HPP__ + +#include "operator.hpp" +#include "swap_axis_param.hpp" + +namespace TEngine { + +class SwapAxis : public OperatorWithParam +{ +public: + SwapAxis() + { + name_ = "SwapAxis"; + } + SwapAxis(const SwapAxis& src) = default; + + virtual ~SwapAxis() {} + bool InferShape(const std::vector& ishape, std::vector& oshape, + int layout) override; + void SetSchema(void) override; +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/swap_axis_param.hpp b/operator/include/operator/swap_axis_param.hpp new file mode 100644 index 000000000..1099442e3 --- /dev/null +++ b/operator/include/operator/swap_axis_param.hpp @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: haoluo@openailab.com + */ +#ifndef __SWAP_AXIS_PARAM_HPP__ +#define __SWAP_AXIS_PARAM_HPP__ + +#include "parameter.hpp" + +namespace TEngine { + +struct SwapAxisParam : public NamedParam +{ + int dim_0; + int dim_1; + DECLARE_PARSER_STRUCTURE(SwapAxisParam) + { + DECLARE_PARSER_ENTRY(dim_0); + DECLARE_PARSER_ENTRY(dim_1); + }; +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/tanh.hpp b/operator/include/operator/tanh.hpp new file mode 100644 index 000000000..9635388c8 --- /dev/null +++ b/operator/include/operator/tanh.hpp @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ +#ifndef __TANH_HPP__ +#define __TANH_HPP__ + +#include "operator.hpp" + +namespace TEngine { + +class Tanh : public OperatorNoParam +{ +public: + Tanh() + { + name_ = "Tanh"; + } + Tanh(const Tanh& src) = default; + virtual ~Tanh(){}; + + void SetSchema(void) override; +}; + +} // namespace TEngine + +#endif diff --git a/operator/operator/Makefile b/operator/operator/Makefile index 5a0169172..376db442b 100644 --- a/operator/operator/Makefile +++ b/operator/operator/Makefile @@ -1,37 +1,46 @@ -obj-y+=convolution.o -obj-y+=softmax.o -obj-y+=pooling.o -obj-y+=input_op.o -obj-y+=fully_connected.o -obj-y+=relu.o -obj-y+=const_op.o -obj-y+=split.o -obj-y+=concat.o -obj-y+=dropout.o obj-y+=accuracy.o +obj-y+=addn.o obj-y+=batch_norm.o -obj-y+=scale.o -obj-y+=lrn.o -obj-y+=fused_operator.o -obj-y+=prelu.o -obj-y+=eltwise.o -obj-y+=slice.o +obj-y+=concat.o +obj-y+=const_op.o +obj-y+=convolution.o +obj-y+=deconvolution.o obj-y+=demo_op.o +obj-y+=detection_output.o +obj-y+=detection_postprocess.o +obj-y+=dropout.o +obj-y+=eltwise.o +obj-y+=flatten.o +obj-y+=fully_connected.o +obj-y+=fused_operator.o +obj-y+=gemm.o +obj-y+=generic.o +obj-y+=input_op.o +obj-y+=logistic.o +obj-y+=lrn.o +obj-y+=lstm.o obj-y+=normalize.o +obj-y+=pad.o obj-y+=permute.o -obj-y+=flatten.o +obj-y+=pooling.o +obj-y+=prelu.o obj-y+=priorbox.o -obj-y+=reshape.o -obj-y+=detection_output.o -obj-y+=rpn.o -obj-y+=roi_pooling.o -obj-y+=reorg.o +obj-y+=reduction.o obj-y+=region.o +obj-y+=relu.o obj-y+=relu6.o -obj-y+=deconvolution.o +obj-y+=reorg.o +obj-y+=reshape.o obj-y+=resize.o -obj-y+=gemm.o -obj-y+=generic.o -obj-y+=lstm.o -obj-y+=logistic.o -obj-y+=detection_postprocess.o +obj-y+=rnn.o +obj-y+=roi_pooling.o +obj-y+=rpn.o +obj-y+=scale.o +obj-y+=sigmoid.o +obj-y+=slice.o +obj-y+=softmax.o +obj-y+=split.o +obj-y+=squeeze.o +obj-y+=swap_axis.o +obj-y+=tanh.o +obj-y+=gru.o diff --git a/operator/operator/accuracy.cpp 
b/operator/operator/accuracy.cpp index 1e6633a2e..aca5330e2 100644 --- a/operator/operator/accuracy.cpp +++ b/operator/operator/accuracy.cpp @@ -27,7 +27,7 @@ namespace TEngine { void Accuracy::SetSchema(void) { - Input({"input:float32"}).Output({"output:float32"}).SetLayout("W").SetDoc(R"DOC(Accuracy Operator)DOC"); + Input({"input:float32"}).Output({"output:float32"}).SetDoc(R"DOC(Accuracy Operator)DOC"); } } // namespace TEngine diff --git a/core/lib/data_layout.cpp b/operator/operator/addn.cpp similarity index 66% rename from core/lib/data_layout.cpp rename to operator/operator/addn.cpp index 515861cf8..15a84372b 100644 --- a/core/lib/data_layout.cpp +++ b/operator/operator/addn.cpp @@ -21,23 +21,22 @@ * Copyright (c) 2017, Open AI Lab * Author: haitao@openailab.com */ -#include "data_layout.hpp" +#include "operator/add_n.hpp" +#include "static_graph.hpp" namespace TEngine { -template <> void NamedData::InitPredefinedData() +bool Addn::InferShape(const std::vector& ishape, std::vector& oshape, int layout) { -#define DUMMY_OBJECT(type) static type DUMMY_OBJECT_##type -#define DUMMY_OBJECT_DEFAULT(type) static type DUMMY_OBJECT_##type(true) + oshape[0] = ishape[0]; + return true; +} - DUMMY_OBJECT_DEFAULT(LayoutNCHW); - DUMMY_OBJECT(LayoutNCDHW); - DUMMY_OBJECT(LayoutNHWC); - DUMMY_OBJECT(LayoutNDHWC); - DUMMY_OBJECT(LayoutNHW); - DUMMY_OBJECT(LayoutNW); - DUMMY_OBJECT(LayoutHW); - DUMMY_OBJECT(LayoutW); +void Addn::SetSchema(void) +{ + Input({"input:float32"}) + .Output({"output:float32"}) + .SetDoc(R"DOC(Addn Operator)DOC"); } } // namespace TEngine diff --git a/operator/operator/concat.cpp b/operator/operator/concat.cpp index 76205f0e3..5edd368eb 100644 --- a/operator/operator/concat.cpp +++ b/operator/operator/concat.cpp @@ -55,7 +55,6 @@ void Concat::SetSchema(void) Input({"input:float32"}) .Output({"output:float32"}) .SetAttr("axis", 1) - .SetLayout("NCHW") .SetDoc(R"DOC(Concat Operator)DOC"); } diff --git a/operator/operator/convolution.cpp 
b/operator/operator/convolution.cpp index 954808867..3a0f51b13 100644 --- a/operator/operator/convolution.cpp +++ b/operator/operator/convolution.cpp @@ -73,57 +73,43 @@ bool Convolution::InferShape(const std::vector& ishape, std::vector= 0) + if(param_.pad_h0 == -1) // TF or SAME_UPPER in ONNX { - param_.pads[0] = param_.pad_h; - param_.pads[2] = param_.pad_h; + param_.pad_h0 = pad_num / 2; + param_.pad_h1 = pad_num - pad_num / 2; } else { - int n = (input_h - 1) / param_.stride_h + 1; - int total_len = (n - 1) * param_.stride_h + param_.kernel_h; - int pad_num = total_len - input_h; - - if(param_.pad_h == -1) // TF or SAME_UPPER in ONNX - { - param_.pads[0] = pad_num / 2; - param_.pads[2] = pad_num - pad_num / 2; - } - else - { - // SAME_LOWER in ONNX - param_.pads[0] = pad_num - pad_num / 2; - param_.pads[2] = pad_num / 2; - } + // SAME_LOWER in ONNX + param_.pad_h0 = pad_num - pad_num / 2; + param_.pad_h1 = pad_num / 2; } + } + + if(param_.pad_w0 < 0) + { + int n = (input_w - 1) / param_.stride_w + 1; + int total_len = (n - 1) * param_.stride_w + param_.kernel_w; + int pad_num = total_len - input_w; - if(param_.pad_w >= 0) + if(param_.pad_w0 == -1) // TF or SAME_UPPER in ONNX { - param_.pads[1] = param_.pad_w; - param_.pads[3] = param_.pad_w; + param_.pad_w0 = pad_num / 2; + param_.pad_w1 = pad_num - pad_num / 2; } else { - int n = (input_w - 1) / param_.stride_w + 1; - int total_len = (n - 1) * param_.stride_w + param_.kernel_w; - int pad_num = total_len - input_w; - - if(param_.pad_w == -1) // TF or SAME_UPPER in ONNX - { - param_.pads[1] = pad_num / 2; - param_.pads[3] = pad_num - pad_num / 2; - } - else - { - // SAME_LOWER in ONNX - param_.pads[1] = pad_num - pad_num / 2; - param_.pads[3] = pad_num / 2; - } + // SAME_LOWER in ONNX + param_.pad_w0 = pad_num - pad_num / 2; + param_.pad_w1 = pad_num / 2; } } @@ -131,9 +117,9 @@ bool Convolution::InferShape(const std::vector& ishape, std::vector& ishape, std::vector dim = {input_n, output_h, output_w, 
output_c}; result.SetDim(dim); - result.SetDataLayout("NHWC"); + result.SetDataLayout(TENGINE_LAYOUT_NHWC); } else { std::vector dim = {input_n, output_c, output_h, output_w}; result.SetDim(dim); - result.SetDataLayout("NCHW"); + result.SetDataLayout(TENGINE_LAYOUT_NCHW); } oshape[0] = result; @@ -176,18 +162,20 @@ void Convolution::SetSchema(void) { Input({"input:float32", "weight:float32", "bias:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("kernel_h", 1) .SetAttr("kernel_w", 1) .SetAttr("stride_h", 1) .SetAttr("stride_w", 1) - .SetAttr("pad_h", 0) - .SetAttr("pad_w", 0) .SetAttr("dilation_h", 1) .SetAttr("dilation_w", 1) + .SetAttr("input_channel", 1) .SetAttr("output_channel", 1) .SetAttr("group", 1) .SetAttr("activation", -1) + .SetAttr("pad_h0", 0) + .SetAttr("pad_w0", 0) + .SetAttr("pad_h1", 0) + .SetAttr("pad_w1", 0) .SetDoc(R"DOC(Convolution Layer)DOC"); } diff --git a/operator/operator/deconvolution.cpp b/operator/operator/deconvolution.cpp index 80ef1d3cf..4101839f6 100644 --- a/operator/operator/deconvolution.cpp +++ b/operator/operator/deconvolution.cpp @@ -35,16 +35,18 @@ bool Deconvolution::InferShape(const std::vector& ishape, std::vector dim = {input_n, param_.num_output, output_h, output_w}; TShape result; result.SetDim(dim); - result.SetDataLayout("NCHW"); + result.SetDataLayout(input_shape.GetDataLayout()); oshape[0] = result; @@ -53,7 +55,7 @@ bool Deconvolution::InferShape(const std::vector& ishape, std::vector& inputs, const std::vector& outputs) { - float ops = 1.0f * param_.num_output * param_.kernel_size * param_.kernel_size * inputs[0].GetSize() * 2; + float ops = 1.0f * param_.num_output * param_.kernel_h * param_.kernel_w * inputs[0].GetSize() * 2; return ops; } @@ -61,13 +63,20 @@ float Deconvolution::GetFops(const std::vector& inputs, const std::vecto void Deconvolution::SetSchema(void) { Input({"input:float32", "weight:float32", "bias:float32"}) - .Output({"output:float32"}) - .SetLayout("NCHW") - 
.SetAttr("kernel_size", 1) - .SetAttr("stride", 1) - .SetAttr("pad", 1) + .Output({"output:float32"}) + .SetAttr("kernel_h", 1) + .SetAttr("kernel_w", 1) + .SetAttr("stride_h", 1) + .SetAttr("stride_w", 1) + .SetAttr("pad_h0", 0) + .SetAttr("pad_w0", 0) + .SetAttr("pad_h1", 0) + .SetAttr("pad_w1", 0) + .SetAttr("dilation_h", 1) + .SetAttr("dilation_w", 1) .SetAttr("num_output", 1) - .SetAttr("dilation", 1) + .SetAttr("group", 1) + .SetAttr("activation", -1) .SetDoc(R"DOC(Deconvolution Layer)DOC"); } diff --git a/operator/operator/demo_op.cpp b/operator/operator/demo_op.cpp index b59bcdca7..8f72ce846 100644 --- a/operator/operator/demo_op.cpp +++ b/operator/operator/demo_op.cpp @@ -25,11 +25,31 @@ namespace TEngine { +/* + DemoOps demos to permute a 2d matrix and + then expanding one column to summarize each row of the permuted matrix +*/ + +bool DemoOp::InferShape(const std::vector& ishape, std::vector& oshape, int layout) +{ + int h=ishape[0].Shape(0); + int w=ishape[0].Shape(1); + std::vector dims; + + dims.push_back(w); + dims.push_back(h+1); + + oshape[0].SetDim(dims); + oshape[0].SetDataLayout(layout); + + return true; +} + + void DemoOp::SetSchema(void) { Input({"input:float32/int8"}) .Output({"output:float32/int8"}) - .SetLayout("NCHW") .SetDoc(R"DOC(Demo Operator: a demo operator to show how to define and run a operator)DOC"); } diff --git a/operator/operator/detection_output.cpp b/operator/operator/detection_output.cpp index 66b7f9b01..8d2983c50 100644 --- a/operator/operator/detection_output.cpp +++ b/operator/operator/detection_output.cpp @@ -35,7 +35,7 @@ bool DetectionOutput::InferShape(const std::vector& ishape, std TShape shape; std::vector dim = {in_dim[0], 1, 6, 1}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; return true; } @@ -44,7 +44,6 @@ void DetectionOutput::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("num_classes",
21) .SetDoc(R"DOC(DetectionOutput Layer)DOC"); diff --git a/operator/operator/detection_postprocess.cpp b/operator/operator/detection_postprocess.cpp index 313ba8733..d12a4089b 100644 --- a/operator/operator/detection_postprocess.cpp +++ b/operator/operator/detection_postprocess.cpp @@ -46,7 +46,7 @@ bool DetectionPostProcess::InferShape(const std::vector& ishape std::vector dim3 = {1, num_detected_boxes}; std::vector dim4 = {1}; - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(ishape[0].GetDataLayout()); shape.SetDim(dim1); oshape[0] = shape; shape.SetDim(dim2); @@ -61,7 +61,7 @@ bool DetectionPostProcess::InferShape(const std::vector& ishape void DetectionPostProcess::SetSchema(void) { - Input({"input:float32"}).Output({"output:float32"}).SetLayout("NCHW").SetDoc(R"DOC(DetectionPostProcess Layer)DOC"); + Input({"input:float32"}).Output({"output:float32"}).SetDoc(R"DOC(DetectionPostProcess Layer)DOC"); } } // namespace TEngine diff --git a/operator/operator/dropout.cpp b/operator/operator/dropout.cpp index 066315536..6df51b791 100644 --- a/operator/operator/dropout.cpp +++ b/operator/operator/dropout.cpp @@ -27,7 +27,7 @@ namespace TEngine { void Dropout::SetSchema(void) { - Input({"input:float32"}).Output({"output:float32"}).SetLayout("NCHW").SetDoc(R"DOC(Dropout Operator)DOC"); + Input({"input:float32"}).Output({"output:float32"}).SetDoc(R"DOC(Dropout Operator)DOC"); } } // namespace TEngine diff --git a/operator/operator/eltwise.cpp b/operator/operator/eltwise.cpp index ab317d90c..438a17c9d 100644 --- a/operator/operator/eltwise.cpp +++ b/operator/operator/eltwise.cpp @@ -59,8 +59,7 @@ void Eltwise::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") - .SetAttr("method", "sum") + .SetAttr("type", 2) .SetAttr("caffe_flavor", 1) .SetDoc(R"DOC(Eltwise Layer)DOC"); } diff --git a/operator/operator/flatten.cpp b/operator/operator/flatten.cpp index 800d00bf4..b59d469a9 100644 --- a/operator/operator/flatten.cpp +++ 
b/operator/operator/flatten.cpp @@ -40,7 +40,7 @@ bool Flatten::InferShape(const std::vector& ishape, std::vector TShape shape; std::vector dim = {in_dim[0], new_channel, 1, 1}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; return true; } @@ -49,7 +49,6 @@ void Flatten::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("axis", 1) .SetAttr("end_axis", 3) .SetDoc(R"DOC(Flatten Layer)DOC"); diff --git a/operator/operator/fully_connected.cpp b/operator/operator/fully_connected.cpp index 2a882afe1..4cb992bc5 100644 --- a/operator/operator/fully_connected.cpp +++ b/operator/operator/fully_connected.cpp @@ -35,8 +35,8 @@ bool FullyConnected::InferShape(const std::vector& ishape, std: int m = input.GetN(); int input_k = input.GetW() * input.GetH() * input.GetC(); - int n = weight.GetH(); - int k = weight.GetW(); + int n = weight.Shape(0); + int k = weight.Shape(1); if(k != input_k) return false; @@ -46,7 +46,7 @@ bool FullyConnected::InferShape(const std::vector& ishape, std: std::vector dim = {m, n, 1, 1}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; @@ -72,7 +72,6 @@ void FullyConnected::SetSchema(void) { Input({"input:float32", "weight:float32", "bias:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("num_output", 10) .SetDoc(R"DOC(Fully Connected Operator)DOC"); } diff --git a/operator/operator/gemm.cpp b/operator/operator/gemm.cpp index 8f6ae56ef..34a691198 100644 --- a/operator/operator/gemm.cpp +++ b/operator/operator/gemm.cpp @@ -51,6 +51,7 @@ bool Gemm::InferShape(const std::vector& ishape, std::vector& ishape, std::vector& oshape, int layout) +{ + // input tensors: + // 0 --- input: [seq_length, batch_size,input_size] + // 1 --- kernel [ (input_size+hidden_size),hidden_state_size] + // others: optional + + // output tensor: 
[output_len,batch_size,hidden_size] + + const TShape input_shape = ishape[0]; + + int batch_size = input_shape.Shape(1); + + std::vector dims(3); + + dims[1] = param_.output_len; + dims[0] = batch_size; + dims[2] = param_.hidden_size; + + oshape[0].SetDim(dims); + + //std::cout<& ishape, std::vector& os // others: optional // output tensor: [output_len, batch_size,hidden_size] - + // std::cout<<"!!!!!!!\n"; const TShape input_shape = ishape[0]; int batch_size = input_shape.Shape(1); @@ -22,6 +22,8 @@ bool LSTM::InferShape(const std::vector& ishape, std::vector& os dims[0] = param_.output_len; dims[1] = batch_size; dims[2] = param_.hidden_size; + + // std::cout<& ishape, std::vector& oshape, int layout) +{ + const TShape& input = ishape[0]; + // TShape& output = oshape[0]; + int n = input.GetN(); + int c = input.GetC(); + int h = input.GetH(); + int w = input.GetW(); + + std::vector o_dim=input.GetDim(); + if(param_.pad_0_h!=-1 && param_.pad_0_w!=-1 + &¶m_.pad_1_h!=-1 && param_.pad_1_w!=-1 + &¶m_.pad_2_h!=-1 && param_.pad_2_w!=-1 + &¶m_.pad_3_h!=-1 && param_.pad_3_w!=-1) + { + o_dim[0]=n+param_.pad_0_h+param_.pad_0_w; + o_dim[1]=h+param_.pad_1_h+param_.pad_1_w; + o_dim[2]=w+param_.pad_2_h+param_.pad_2_w; + o_dim[3]=c+param_.pad_3_h+param_.pad_3_w; + } + else + { + return false; + } + TShape shape; + shape.SetDim(o_dim); + shape.SetDataLayout(TENGINE_LAYOUT_NHWC); + oshape[0] = shape; + return true; +} + +void Pad::SetSchema(void) +{ + Input({"input:float32"}) + .Output({"output:float32"}) + .SetAttr("mode", 0) + .SetAttr("pad_0_h", -1) + .SetAttr("pad_0_w", -1) + .SetAttr("pad_1_h", -1) + .SetAttr("pad_1_w", -1) + .SetAttr("pad_2_h", -1) + .SetAttr("pad_2_w", -1) + .SetAttr("pad_3_h", -1) + .SetAttr("pad_3_w", -1) + .SetAttr("value", 0) + .SetDoc(R"DOC(Pad Layer)DOC"); +} + +} // namespace TEngine diff --git a/operator/operator/permute.cpp b/operator/operator/permute.cpp index 23db8bf64..30f67d07e 100644 --- a/operator/operator/permute.cpp +++ 
b/operator/operator/permute.cpp @@ -28,18 +28,31 @@ namespace TEngine { bool Permute::InferShape(const std::vector& ishape, std::vector& oshape, int layout) { const TShape& input = ishape[0]; - int n = input.GetN(); - int c = input.GetC(); - int h = input.GetH(); - int w = input.GetW(); + const std::vector dims = input.GetDim(); // only support for 0231[bhwc] if((param_.order0 == 0) && (param_.order1 == 2) && (param_.order2 == 3) && (param_.order3 == 1)) { + int n = input.GetN(); + int c = input.GetC(); + int h = input.GetH(); + int w = input.GetW(); TShape shape; std::vector dim = {n, h, w, c}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(TENGINE_LAYOUT_NHWC); + oshape[0] = shape; + return true; + } + else if((param_.order0 == 1) && (param_.order1 == 0) && (param_.order2 == 2) && dims.size()==3) + { + // int n = input.GetN(); + int c = input.Shape(0); + int h = input.Shape(1); + int w = input.Shape(2); + TShape shape; + std::vector dim = {h,c,w}; + shape.SetDim(dim); oshape[0] = shape; return true; } @@ -53,7 +66,6 @@ void Permute::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("flag", 0) .SetAttr("order0", 0) .SetAttr("order1", 1) diff --git a/operator/operator/pooling.cpp b/operator/operator/pooling.cpp index e0e02dc7d..78ebcb89d 100644 --- a/operator/operator/pooling.cpp +++ b/operator/operator/pooling.cpp @@ -85,34 +85,25 @@ bool Pooling::InferShape(const std::vector& ishape, std::vector if(param_.global) { - param_.pad_h = 0; - param_.pad_w = 0; param_.stride_h = 1; param_.stride_w = 1; - - param_.kernel_shape[0] = input_h; - param_.kernel_shape[1] = input_w; - param_.pads[0] = param_.pads[1] = param_.pads[2] = param_.pads[3] = 0; - param_.strides[0] = param_.strides[1] = 1; - + param_.kernel_h = input_h; + param_.kernel_w = input_w; + param_.pad_h0 = param_.pad_w0 = param_.pad_h1 = param_.pad_w1 = 0; output_h = 1; output_w = 1; } else { - param_.kernel_shape[0] = 
param_.kernel_h; - param_.kernel_shape[1] = param_.kernel_w; - param_.strides[0] = param_.stride_h; - param_.strides[1] = param_.stride_w; output_h = - calc_output_size(input_h, param_.kernel_shape[0], param_.stride_h, param_.pad_h, param_.caffe_flavor); + calc_output_size(input_h, param_.kernel_h, param_.stride_h, param_.pad_h0, param_.caffe_flavor); output_w = - calc_output_size(input_w, param_.kernel_shape[1], param_.stride_w, param_.pad_w, param_.caffe_flavor); + calc_output_size(input_w, param_.kernel_w, param_.stride_w, param_.pad_w0, param_.caffe_flavor); - calc_real_pads(output_h, input_h, param_.kernel_shape[0], param_.stride_h, param_.pad_h, ¶m_.pads[0], - ¶m_.pads[2]); - calc_real_pads(output_w, input_w, param_.kernel_shape[1], param_.stride_w, param_.pad_w, ¶m_.pads[1], - ¶m_.pads[3]); + calc_real_pads(output_h, input_h, param_.kernel_h, param_.stride_h, param_.pad_h0, ¶m_.pad_h0, + ¶m_.pad_h1); + calc_real_pads(output_w, input_w, param_.kernel_w, param_.stride_w, param_.pad_w0, ¶m_.pad_w0, + ¶m_.pad_w1); } TShape shape; @@ -121,14 +112,14 @@ bool Pooling::InferShape(const std::vector& ishape, std::vector std::vector dim = {input_shape.GetN(), input_shape.GetC(), output_h, output_w}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(TENGINE_LAYOUT_NCHW); } else { std::vector dim = {input_shape.GetN(), output_h, output_w, input_shape.GetC()}; shape.SetDim(dim); - shape.SetDataLayout("NHWC"); + shape.SetDataLayout(TENGINE_LAYOUT_NHWC); } oshape[0] = shape; return true; @@ -136,7 +127,7 @@ bool Pooling::InferShape(const std::vector& ishape, std::vector float Pooling::GetFops(const std::vector& inputs, const std::vector& outputs) { - float patch_fops = param_.kernel_shape[0] * param_.kernel_shape[1]; + float patch_fops = param_.kernel_h * param_.kernel_w; return (patch_fops * outputs[0].GetSize()); } @@ -145,16 +136,17 @@ void Pooling::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") 
.SetAttr("alg", 0) .SetAttr("kernel_h", 2) .SetAttr("kernel_w", 2) .SetAttr("stride_h", 1) .SetAttr("stride_w", 1) - .SetAttr("pad_h", 0) - .SetAttr("pad_w", 0) .SetAttr("global", 0) .SetAttr("caffe_flavor", 0) + .SetAttr("pad_h0", 0) + .SetAttr("pad_w0", 0) + .SetAttr("pad_h1", 0) + .SetAttr("pad_w1", 0) .SetDoc(R"DOC(Pooling Layer)DOC"); } diff --git a/operator/operator/prelu.cpp b/operator/operator/prelu.cpp index 97bec951f..37f5ccb02 100644 --- a/operator/operator/prelu.cpp +++ b/operator/operator/prelu.cpp @@ -34,7 +34,6 @@ void PReLU::SetSchema(void) { Input({"input:float32", "slope:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") // to check .SetDoc(R"DOC(PreLu Operator)DOC"); } diff --git a/operator/operator/priorbox.cpp b/operator/operator/priorbox.cpp index 1d5d9d287..59363ec21 100644 --- a/operator/operator/priorbox.cpp +++ b/operator/operator/priorbox.cpp @@ -54,7 +54,7 @@ bool PriorBox::InferShape(const std::vector& ishape, std::vecto TShape shape; std::vector dim = {feat_dim[0], 2, param_.out_dim_, 1}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; return true; } @@ -63,7 +63,6 @@ void PriorBox::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("offset", 0.5) .SetDoc(R"DOC(PriorBox Layer)DOC"); diff --git a/operator/operator/reduction.cpp b/operator/operator/reduction.cpp new file mode 100644 index 000000000..f8001a9b0 --- /dev/null +++ b/operator/operator/reduction.cpp @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#include "operator/reduction.hpp" + +namespace TEngine { + +bool Reduction::InferShape(const std::vector& ishape, std::vector& oshape, int layout) +{ + const TShape& input = ishape[0]; + + const std::vector& in_dim = input.GetDim(); + int in_size=in_dim.size(); + std::vector new_shape; + if(param_.dim_0 != -2) + new_shape.push_back(param_.dim_0); + if(param_.dim_1 != -2) + new_shape.push_back(param_.dim_1); + if(param_.dim_2 != -2) + new_shape.push_back(param_.dim_2); + if(param_.dim_3 != -2) + new_shape.push_back(param_.dim_3); + bool should_reduced[4] = {false}; + int reduceddim=0; + int kd=param_.keepdim; + int newshape_size = new_shape.size(); + std::vector real_shape={0,2,3,1}; + if(newshape_size) + { + for(int i=0;i=0) + { + int idx=new_shape[i]; + if(input.GetDataLayout()==TENGINE_LAYOUT_NCHW) + idx=real_shape[idx]; + if(idx>=0 && idx<4) + { + + should_reduced[idx]=true; + ++reduceddim; + } + } + else if(new_shape[i]<0) + { + int current=in_dim.size()+new_shape[i]; + if(input.GetDataLayout()==TENGINE_LAYOUT_NCHW) + { + current=real_shape[current]; + } + + should_reduced[current]=true; + ++reduceddim; + + } + } + } + else + { + for(int idx=0;idx odim={1}; + TShape shape; + shape.SetDim(odim); + + shape.SetDataLayout(input.GetDataLayout()); + oshape[0] = shape; + return true; + } + else + { + std::vector odim(in_size); + for(int i_idx=0,o_idx=0;i_idx odim(o_size); + for(int i_idx=0,o_idx=0;i_idx& inputs, const std::vector& ishape, std::vector dim = {n, c * 
(stride * stride), h / stride, w / stride}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; return true; @@ -47,7 +47,6 @@ void Reorg::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("stride", 1) .SetDoc(R"DOC(Reorg Operator)DOC"); } diff --git a/operator/operator/reshape.cpp b/operator/operator/reshape.cpp index 70c99d7f9..362a3eac9 100644 --- a/operator/operator/reshape.cpp +++ b/operator/operator/reshape.cpp @@ -60,24 +60,8 @@ bool Reshape::InferShape(const std::vector& ishape, std::vector TShape shape; shape.SetDim(new_shape); - // only support 2-D 3-D or 4-D - if(new_shape.size() == 4) - { - if(layout == TENGINE_LAYOUT_NCHW) - shape.SetDataLayout("NCHW"); - else - shape.SetDataLayout("NHWC"); - } - else if(new_shape.size() == 3) - { - shape.SetDataLayout("NHW"); - } - else if(new_shape.size() == 2) - { - shape.SetDataLayout("HW"); - } - else - return false; + shape.SetDataLayout(input.GetDataLayout()); + oshape[0] = shape; return true; } @@ -86,7 +70,6 @@ void Reshape::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("dim_0", -2) .SetAttr("dim_1", -2) .SetAttr("dim_2", -2) diff --git a/operator/operator/resize.cpp b/operator/operator/resize.cpp index 336650e68..9966c90bb 100644 --- a/operator/operator/resize.cpp +++ b/operator/operator/resize.cpp @@ -37,7 +37,7 @@ bool Resize::InferShape(const std::vector& ishape, std::vector< std::vector dim = {in_dim[0], in_dim[1], out_h, out_w}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; @@ -48,7 +48,6 @@ void Resize::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("scale_h", 1.f) .SetAttr("scale_w", 1.f) diff --git a/operator/operator/rnn.cpp b/operator/operator/rnn.cpp new file mode 100644 index 000000000..17934062a --- /dev/null 
+++ b/operator/operator/rnn.cpp @@ -0,0 +1,51 @@ +#include "operator/rnn.hpp" +#include "operator/rnn_param.hpp" +#include "static_graph.hpp" + +namespace TEngine { + +bool RNN::InferShape(const std::vector& ishape, std::vector& oshape, int layout) +{ + // input tensors: + // 0 --- input: [seq_length, batch_size,input_size] + // 1 --- kernel [ (input_size+hidden_size),hidden_state_size] + // others: optional + + // output tensor: [output_len,batch_size,hidden_size] + + const TShape input_shape = ishape[0]; + + int batch_size = input_shape.Shape(1); + + std::vector dims(3); + + dims[0] = param_.output_len; + dims[1] = batch_size; + dims[2] = param_.hidden_size; + + oshape[0].SetDim(dims); + + return true; +} + +void RNN::SetSchema(void) +{ + Input({"input:float32", "kernel:float32", "bias:float32", "init_h:float32"}) + .Output({"output:float32"}) + .SetAttr("clip", 0.0f) + .SetAttr("output_len", 1) + .SetAttr("sequence_len", 1) + .SetAttr("input_size", 1) + .SetAttr("hidden_size", 1) + .SetAttr("has_clip", 0) + .SetAttr("has_bias", 0) + .SetAttr("has_init_state", 0) + .SetAttr("activation", RNN_ACT_TANH) + .SetDoc(R"DOC(LSTM Cell + input: input sequences, a 3D tensor [seq_length,batch_size,input_size] + kernel: gate weight tensor,[num_directions, hidden_size, ] + bias: gate bias tensor, [num_directions, hidden_size] + init_h: optional [hidden_size] + )DOC"); +} +} // namespace TEngine diff --git a/operator/operator/roi_pooling.cpp b/operator/operator/roi_pooling.cpp index a1614fb0a..bff14634d 100644 --- a/operator/operator/roi_pooling.cpp +++ b/operator/operator/roi_pooling.cpp @@ -37,7 +37,7 @@ bool ROIPooling::InferShape(const std::vector& ishape, std::vec std::vector dim = {300, c, param_.pooled_h, param_.pooled_w}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; @@ -48,7 +48,6 @@ void ROIPooling::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") 
.SetAttr("spatial_scale", 1.f) .SetDoc(R"DOC(ROIPooling Layer)DOC"); diff --git a/operator/operator/rpn.cpp b/operator/operator/rpn.cpp index cf900e460..3175067c7 100644 --- a/operator/operator/rpn.cpp +++ b/operator/operator/rpn.cpp @@ -100,7 +100,7 @@ bool RPN::InferShape(const std::vector& ishape, std::vector dim = {feat_dim[0], param_.post_nms_topn + 1, 4, 1}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; return true; } @@ -109,7 +109,6 @@ void RPN::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("feat_stride", 16) .SetDoc(R"DOC(RPN Layer)DOC"); diff --git a/operator/operator/sigmoid.cpp b/operator/operator/sigmoid.cpp new file mode 100644 index 000000000..ff7dabae9 --- /dev/null +++ b/operator/operator/sigmoid.cpp @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ +#include "operator/sigmoid.hpp" + +namespace TEngine { + +float Sigmoid::GetFops(const std::vector& inputs, const std::vector& outputs) +{ + return inputs[0].GetSize(); +} + +void Sigmoid::SetSchema(void) +{ + Input({"input:float32"}) + .Output({"output:float32"}) + .SetDoc(R"DOC(ReLu Operator)DOC"); +} + +} // namespace TEngine diff --git a/operator/operator/slice.cpp b/operator/operator/slice.cpp index 8d701f946..3b1a0e52c 100644 --- a/operator/operator/slice.cpp +++ b/operator/operator/slice.cpp @@ -26,35 +26,66 @@ namespace TEngine { bool Slice::InferShape(const std::vector& ishape, std::vector& oshape, int layout) { - // only support for slice_axis=1 const TShape& input = ishape[0]; + std::vector input_dim = input.GetDim(); - int n = input.GetN(); - int c = input.GetC(); - int h = input.GetH(); - int w = input.GetW(); - - if(c % 2 != 0) - return false; - - TShape shape; - - std::vector dim = {n, c / 2, h, w}; - - shape.SetDim(dim); - shape.SetDataLayout("NCHW"); - - oshape[0] = shape; - oshape[1] = shape; - + if(param_.iscaffe) + { + int slice_axis = param_.axis; + if(param_.slice_point_.size()!= 0) + { + int prev = 0; + int input_slice_num = input_dim[slice_axis]; + unsigned int i = 0 ; + for (; i < param_.slice_point_.size(); ++i) + { + input_dim[slice_axis] = (param_.slice_point_[i] - prev); + prev = param_.slice_point_[i]; + oshape[i].SetDim(input_dim); + oshape[i].SetDataLayout(input.GetDataLayout()); + } + //The last one + input_dim[slice_axis] = (input_slice_num - prev); + oshape[i].SetDim(input_dim); + oshape[i].SetDataLayout(input.GetDataLayout()); + } + else + { + int out_num = oshape.size(); + if(input.Shape(slice_axis) % out_num != 0) + return false; + if(slice_axis > (int)input_dim.size()) + return false; + input_dim[slice_axis] = input_dim[slice_axis] / out_num; + for(int i = 0; i < out_num; i++) + { + oshape[i].SetDim(input_dim); + 
oshape[i].SetDataLayout(input.GetDataLayout()); + } + } + } + else + { + std::vector out_dim; + //input shape size must be equal to begin and size's size; + if( (param_.size_.size()!= param_.begin_.size())|| (param_.size_.size()!= input_dim.size())) + return false; + out_dim.reserve(input_dim.size()); + for(unsigned int i = 0; i < input_dim.size(); i++) + { + out_dim[i] = param_.size_[i]; + } + oshape[0].SetDim(out_dim); + oshape[0].SetDataLayout(input.GetDataLayout()); + } return true; } void Slice::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("axis", 1) + .SetAttr("iscaffe", true) .SetDoc(R"DOC(Slice Operator)DOC"); } diff --git a/operator/operator/softmax.cpp b/operator/operator/softmax.cpp index 669983bab..b12146c1b 100644 --- a/operator/operator/softmax.cpp +++ b/operator/operator/softmax.cpp @@ -29,7 +29,6 @@ void Softmax::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("axis", 1) .SetDoc(R"DOC(Softmax Operator)DOC"); } diff --git a/operator/operator/split.cpp b/operator/operator/split.cpp index c10d420be..a85002c36 100644 --- a/operator/operator/split.cpp +++ b/operator/operator/split.cpp @@ -28,14 +28,65 @@ namespace TEngine { bool Split::InferShape(const std::vector& ishape, std::vector& oshape, int layout) { - for(unsigned int i = 0; i < oshape.size(); i++) - oshape[i] = ishape[0]; + int axis = param_.axis; + const TShape shape = ishape[0]; + std::vector input_dim = shape.GetDim(); + + if(param_.is_caffe) + { + for(unsigned int i = 0; i < oshape.size(); i++) + oshape[i] = ishape[0]; + } + else + { + if(param_.split_sizes_.size()!= 0) + { + int sumcheck = 0; + int input_slice_num = input_dim[axis]; + for (unsigned int i = 0; i < param_.split_sizes_.size(); ++i) + { + sumcheck+=param_.split_sizes_[i]; + } + if(sumcheck!=input_slice_num) + { + return false; + } + for (unsigned int i = 0; i < param_.split_sizes_.size(); ++i) + { + input_dim[axis] = 
(param_.split_sizes_[i]); + oshape[i].SetDim(input_dim); + oshape[i].SetDataLayout(shape.GetDataLayout()); + } + } + else + { + int split_dim = param_.split_dim; + int split_shape = 0; + std::vector dim; + dim = ishape[0].GetDim(); + if(dim[axis]% split_dim!=0) + return false; + split_shape= dim[axis]/split_dim; + input_dim[axis]=split_shape; + for(unsigned int i = 0; i < oshape.size(); i++) + { + oshape[i].SetDim(input_dim); + oshape[i].SetDataLayout(shape.GetDataLayout()); + } + } + + } + return true; } - void Split::SetSchema(void) { - Input({"input:float32"}).Output({"output:float32"}).SetLayout("NCHW").SetDoc(R"DOC(Split Operator)DOC"); + Input({"input:float32"}) + .Output({"output:float32"}) + .SetAttr("axis", 0) + .SetAttr("split_dim", 1) + .SetAttr("is_caffe", false) + .SetDoc(R"DOC(Split Operator)DOC"); } } // namespace TEngine diff --git a/operator/operator/squeeze.cpp b/operator/operator/squeeze.cpp new file mode 100644 index 000000000..0e357fb85 --- /dev/null +++ b/operator/operator/squeeze.cpp @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#include "operator/squeeze.hpp" + +namespace TEngine { + +bool Squeeze::InferShape(const std::vector& ishape, std::vector& oshape, int layout) +{ + const TShape& input = ishape[0]; + + const std::vector& in_dim = input.GetDim(); + int in_size=in_dim.size(); + std::vector new_shape; + if(param_.dim_0 != -2) + new_shape.push_back(param_.dim_0); + if(param_.dim_1 != -2) + new_shape.push_back(param_.dim_1); + if(param_.dim_2 != -2) + new_shape.push_back(param_.dim_2); + if(param_.dim_3 != -2) + new_shape.push_back(param_.dim_3); + bool should_squeeze[4] = {false}; + int squeezeddim=0; + int newshape_size = new_shape.size(); + std::vector real_shape={0,2,3,1}; + if(newshape_size) + { + for(int i=0;i=0) + { + int idx=new_shape[i]; + if(input.GetDataLayout()==TENGINE_LAYOUT_NCHW) + idx=real_shape[idx]; + if(in_dim[idx]==1 && idx>=0 && idx<4) + { + + should_squeeze[idx]=true; + ++squeezeddim; + } + } + else if(new_shape[i]<0) + { + int idx=new_shape[i]; + if(input.GetDataLayout()==TENGINE_LAYOUT_NCHW) + idx=real_shape[idx]; + if(in_dim[idx]==1 && idx>0 && idx<3) + { + int current=in_dim.size()+idx; + should_squeeze[current]=true; + ++squeezeddim; + } + } + } + } + else + { + for(int idx=0;idx odim(in_size-squeezeddim); + int o_idx=0; + for(int i_idx=0;i_idx& ishape, std::vector& oshape, int layout) +{ + if(param_.dim_0 == param_.dim_1 ) + { + return false; + } + if(ishape.size()!=1 || oshape.size()!=1) + return false; + + const std::vector& in_dim = ishape[0].GetDim(); + int in_dim_size = in_dim.size(); + + if( param_.dim_0 >= in_dim_size || param_.dim_1 >= in_dim_size) + return false; + + std::vector new_dim; + new_dim.resize(in_dim_size); + for(int i=0;i("LSTM"); RegisterOp("Logistic"); RegisterOp("DetectionPostProcess"); + RegisterOp("RNN"); + RegisterOp("Tanh"); + RegisterOp("Sigmoid"); + RegisterOp("Squeeze"); + RegisterOp("Pad"); + RegisterOp("Reduction"); + RegisterOp("SwapAxis"); + 
RegisterOp("GRU"); + RegisterOp("Addn"); // std::cout<<"OPERATOR PLUGIN INITED\n"; return 0; diff --git a/scripts/makefile.build b/scripts/makefile.build index 52a546b46..ff2704a16 100644 --- a/scripts/makefile.build +++ b/scripts/makefile.build @@ -19,7 +19,7 @@ bin-obj-y:= obj-y:= subdir-y:= --include $(MAKEFILE_CONFIG) +include $(MAKEFILE_CONFIG) include Makefile @@ -48,9 +48,7 @@ endif prebuilt_objs=$(prebuilt-obj-y) -#real_subdir_built_in=$(foreach f, $(subdir_objs), $(wildcard $(f)) ) -real_subdir_built_in=$(subdir_objs) -real_built_in_objs=$(real_subdir_built_in) $(curdir_objs) $(prebuilt_objs) +real_built_in_objs= $(subdir_objs) $(curdir_objs) $(prebuilt_objs) #add BUILD_DIR PREFIX curdir_objs:=$(addprefix $(BUILD_DIR)/, $(cur_objs)) @@ -79,10 +77,14 @@ $(subdir_objs): $(subdir-y); endif $(BUILT_IN_OBJ): $(real_built_in_objs) - @echo $(BUILT_IN_LD) -r -o $@ $(real_built_in_objs); \ - $(BUILT_IN_LD) -r -o $@ $(real_built_in_objs); + @for file in $? ; do if [ -f $$file ] ; then \ + NEED_BUILD=true; break; fi; done; \ + if [ $$NEED_BUILD ]; then \ + echo "$(BUILT_IN_LD) -r -o $@ $(wildcard $(real_built_in_objs))"; \ + $(BUILT_IN_LD) -r -o $@ $(wildcard $(real_built_in_objs)); fi; else $(BUILT_IN_OBJ): + endif clean:: $(subdir-y) diff --git a/serializer/Makefile b/serializer/Makefile index ab60e187f..6b9126e89 100644 --- a/serializer/Makefile +++ b/serializer/Makefile @@ -8,6 +8,7 @@ MODULE_DIR+= ifeq ($(CONFIG_CAFFE_SERIALIZER),y) obj-y+=caffe/ COMMON_CFLAGS+= -DCONFIG_CAFFE_SERIALIZER + PROTOBUF_NEEDED=y endif ifeq ($(CONFIG_MXNET_SERIALIZER),y) obj-y+=mxnet/ @@ -20,6 +21,7 @@ endif ifeq ($(CONFIG_TF_SERIALIZER),y) obj-y+=tensorflow/ COMMON_CFLAGS+= -DCONFIG_TF_SERIALIZER + PROTOBUF_NEEDED=y endif ifeq ($(CONFIG_TFLITE_SERIALIZER),y) obj-y+=tf_lite/ @@ -31,14 +33,19 @@ ifeq ($(CONFIG_TENGINE_SERIALIZER),y) COMMON_CFLAGS+= -DCONFIG_TENGINE_SERIALIZER endif -obj-y+=source/ -obj-y+=plugin/ +ifeq ($(PROTOBUF_NEEDED),y) + #to get the protobuf header file + 
PROTOBUF_HEADER=$(shell pkg-config --cflags protobuf) + COMMON_CFLAGS+=$(PROTOBUF_HEADER) +endif CXXFLAGS+= COMMON_CFLAGS+=$(CONFIG_OPT_CFLAGS) COMMON_CFLAGS+= -Wall -g -I$(shell pwd)/include -fPIC $(INC_DIR) -Werror +obj-y+=source/ +obj-y+=plugin/ install: diff --git a/serializer/caffe/caffe_serializer.cpp b/serializer/caffe/caffe_serializer.cpp index 06fe33cda..7b8333456 100644 --- a/serializer/caffe/caffe_serializer.cpp +++ b/serializer/caffe/caffe_serializer.cpp @@ -32,8 +32,10 @@ #include #include +#include "tengine_c_api.h" #include "data_type.hpp" #include "type_name.hpp" +#include "exec_attr.hpp" #include "tengine_errno.hpp" #include "caffe_serializer.hpp" #include "operator_manager.hpp" @@ -60,6 +62,8 @@ #include "operator/region_param.hpp" #include "operator/deconv_param.hpp" #include "operator/resize_param.hpp" +#include "operator/split_param.hpp" + namespace TEngine { @@ -207,6 +211,10 @@ bool CaffeSingle::LoadModel(const std::vector& file_list, StaticGra SetGraphSource(graph, file_list[0]); SetGraphSourceFormat(graph, "caffe"); SetGraphConstTensorFile(graph, file_list[0]); + SetGraphLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelFormat(graph,MODEL_FORMAT_CAFFE); + return LoadGraph(caffe_net, graph); } @@ -237,7 +245,6 @@ bool CaffeSingle::LoadNode(StaticGraph* graph, StaticNode* node, const te_caffe: StaticTensor* tensor = CreateStaticTensor(graph, tensor_name); - SetTensorDataLayout(tensor, "NCHW"); SetTensorDataType(tensor, DataType::GetTypeID("float32")); AddNodeOutputTensor(node, tensor); @@ -308,6 +315,10 @@ bool CaffeBuddy::LoadModel(const std::vector& file_list, StaticGrap SetGraphSource(graph, file_list[1]); SetGraphSourceFormat(graph, "caffe"); SetGraphConstTensorFile(graph, file_list[1]); + SetGraphLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelFormat(graph,MODEL_FORMAT_CAFFE); + return LoadGraph(test_net, train_net, graph); } @@ -425,7 +436,6 @@ static void 
LoadCaffeBlob(StaticGraph* graph, StaticNode* node, const std::vecto SetTensorDim(tensor, dims); SetTensorDataType(tensor, DataType::GetTypeID("float32")); - SetTensorDataLayout(tensor, layout_list[i]); int mem_size = blob.data_size() * 4; @@ -459,7 +469,6 @@ static void CreatePresetNode(StaticGraph* graph, StaticNode* node, const char* n SetTensorDim(tensor, dims); SetTensorDataType(tensor, DataType::GetTypeID("float32")); - SetTensorDataLayout(tensor, layout); int elem_size = 1; @@ -685,24 +694,21 @@ static bool LoadCaffeNormalize(StaticGraph* graph, StaticNode* node, const te_ca static bool LoadCaffeSlice(StaticGraph* graph, StaticNode* node, const te_caffe::LayerParameter& layer_param) { - const te_caffe::SliceParameter& slice_param = layer_param.slice_param(); - - SliceParam param = any_cast(OpManager::GetOpDefParam("Slice")); - - if(slice_param.has_axis()) + const te_caffe::SliceParameter& slice_param = layer_param.slice_param(); + SliceParam param = any_cast(OpManager::GetOpDefParam("Slice")); + if(slice_param.has_axis()) param.axis = slice_param.axis(); - else + else param.axis = 1; - - StaticOp* op = CreateStaticOp(graph, "Slice"); - - SetOperatorParam(op, param); - - SetNodeOp(node, op); - - return true; + param.iscaffe = true; + param.slice_point_.clear(); + std::copy(slice_param.slice_point().begin(),slice_param.slice_point().end(),std::back_inserter(param.slice_point_)); + StaticOp* op = CreateStaticOp(graph, "Slice"); + SetOperatorParam(op, param); + SetNodeOp(node, op); + + return true; } - static bool LoadCaffeReLu(StaticGraph* graph, StaticNode* node, const te_caffe::LayerParameter& layer_param) { ReLuParam param = any_cast(OpManager::GetOpDefParam("ReLu")); @@ -724,10 +730,11 @@ static bool LoadCaffeReLu(StaticGraph* graph, StaticNode* node, const te_caffe:: static bool LoadCaffeSplit(StaticGraph* graph, StaticNode* node, const te_caffe::LayerParameter& layer_param) { + SplitParam param = any_cast(OpManager::GetOpDefParam("Split")); + 
param.is_caffe=true; StaticOp* op = CreateStaticOp(graph, "Split"); - + SetOperatorParam(op, param); SetNodeOp(node, op); - return true; } @@ -1050,13 +1057,17 @@ static bool LoadCaffeConvolution(StaticGraph* graph, StaticNode* node, const te_ if(conv_param.has_pad_h() && conv_param.has_pad_w()) { - param.pad_h = conv_param.pad_h(); - param.pad_w = conv_param.pad_w(); + param.pad_h0 = conv_param.pad_h(); + param.pad_h1 = conv_param.pad_h(); + param.pad_w0 = conv_param.pad_w(); + param.pad_w1 = conv_param.pad_w(); } else if(conv_param.pad_size()) { - param.pad_h = conv_param.pad(0); - param.pad_w = conv_param.pad(0); + param.pad_h0 = conv_param.pad(0); + param.pad_h1 = conv_param.pad(0); + param.pad_w0 = conv_param.pad(0); + param.pad_w1 = conv_param.pad(0); } param.output_channel = conv_param.num_output(); @@ -1092,16 +1103,54 @@ static bool LoadCaffeDeconvolution(StaticGraph* graph, StaticNode* node, const t DeconvParam param = any_cast(OpManager::GetOpDefParam("Deconvolution")); - param.kernel_size = conv_param.kernel_size(0); - param.stride = conv_param.stride(0); - param.pad = conv_param.pad(0); + if(conv_param.has_kernel_h() && conv_param.has_kernel_w()) + { + param.kernel_h = conv_param.kernel_h(); + param.kernel_w = conv_param.kernel_w(); + } + else + { + param.kernel_h = conv_param.kernel_size(0); + param.kernel_w = conv_param.kernel_size(0); + } + + if(conv_param.has_stride_h() && conv_param.has_stride_w()) + { + param.stride_h = conv_param.stride_h(); + param.stride_w = conv_param.stride_w(); + } + else if(conv_param.stride_size()) + { + param.stride_h = conv_param.stride(0); + param.stride_w = conv_param.stride(0); + } + + if(conv_param.has_pad_h() && conv_param.has_pad_w()) + { + param.pad_h0 = conv_param.pad_h(); + param.pad_h1 = conv_param.pad_h(); + param.pad_w0 = conv_param.pad_w(); + param.pad_w1 = conv_param.pad_w(); + } + else if(conv_param.pad_size()) + { + param.pad_h0 = conv_param.pad(0); + param.pad_w0 = conv_param.pad(0); + param.pad_h1 = 
conv_param.pad(0); + param.pad_w1 = conv_param.pad(0); + } param.num_output = conv_param.num_output(); + if(conv_param.has_group()) + param.group = conv_param.group(); + if(conv_param.dilation_size()) { - param.dilation = conv_param.dilation(0); + param.dilation_h = conv_param.dilation(0); + param.dilation_w = conv_param.dilation(0); } + StaticOp* op = CreateStaticOp(graph, "Deconvolution"); SetOperatorParam(op, param); @@ -1147,27 +1196,23 @@ static bool LoadCaffePooling(StaticGraph* graph, StaticNode* node, const te_caff param.kernel_h = pool_param.kernel_h(); param.kernel_w = pool_param.kernel_w(); } - param.kernel_shape.resize(2); - param.kernel_shape[0] = param.kernel_h; - param.kernel_shape[1] = param.kernel_w; param.global = pool_param.global_pooling(); if(pool_param.has_pad()) { - param.pad_h = pool_param.pad(); - param.pad_w = pool_param.pad(); + param.pad_h0 = pool_param.pad(); + param.pad_h1 = pool_param.pad(); + param.pad_w0 = pool_param.pad(); + param.pad_w1 = pool_param.pad(); } else if(pool_param.has_pad_h() && pool_param.has_pad_w()) { - param.pad_h = pool_param.pad_h(); - param.pad_w = pool_param.pad_w(); + param.pad_h0 = pool_param.pad_h(); + param.pad_h1 = pool_param.pad_h(); + param.pad_w0 = pool_param.pad_w(); + param.pad_w1 = pool_param.pad_w(); } - param.pads.resize(4); - param.pads[0] = param.pad_h; - param.pads[1] = param.pad_w; - param.pads[2] = param.pad_h; - param.pads[3] = param.pad_w; if(pool_param.has_stride()) { @@ -1179,9 +1224,6 @@ static bool LoadCaffePooling(StaticGraph* graph, StaticNode* node, const te_caff param.stride_h = pool_param.stride_h(); param.stride_w = pool_param.stride_w(); } - param.strides.resize(2); - param.strides[0] = param.stride_h; - param.strides[1] = param.stride_w; param.caffe_flavor = 1; diff --git a/serializer/include/tm_format.h b/serializer/include/tengine/v1/tm1_format.h similarity index 98% rename from serializer/include/tm_format.h rename to serializer/include/tengine/v1/tm1_format.h index 
f28ae033e..890b37e13 100644 --- a/serializer/include/tm_format.h +++ b/serializer/include/tengine/v1/tm1_format.h @@ -21,11 +21,11 @@ * Copyright (c) 2018, Open AI Lab * Author: jingyou@openailab.com */ -#ifndef __TM_FORMAT_H__ -#define __TM_FORMAT_H__ +#ifndef __TM1_FORMAT_H__ +#define __TM1_FORMAT_H__ + +#include "tm_generate.h" -#include -#include #include #ifdef __cplusplus @@ -38,8 +38,6 @@ extern "C" { #define NOT_SET 0x00 -#define TM_FILE_MAX_SIZE 1 << 30 /* 1G */ - /* Type define */ typedef uint32_t tm_uoffset_t; /* offset is 4-byte unsigned integer */ typedef uint32_t tm_size_t; /* size is 4-byte unsigned integer */ diff --git a/serializer/include/tm_op_serializer.hpp b/serializer/include/tengine/v1/tm1_op_serializer.hpp similarity index 97% rename from serializer/include/tm_op_serializer.hpp rename to serializer/include/tengine/v1/tm1_op_serializer.hpp index 6e98add41..930202153 100644 --- a/serializer/include/tm_op_serializer.hpp +++ b/serializer/include/tengine/v1/tm1_op_serializer.hpp @@ -21,8 +21,8 @@ * Copyright (c) 2018, Open AI Lab * Author: jingyou@openailab.com */ -#ifndef __TM_OP_SERIALIZER_HPP__ -#define __TM_OP_SERIALIZER_HPP__ +#ifndef __TM1_OP_SERIALIZER_HPP__ +#define __TM1_OP_SERIALIZER_HPP__ #include #include "static_graph_interface.hpp" @@ -73,10 +73,13 @@ #include "operator/slice_param.hpp" #include "operator/softmax_param.hpp" #include "logger.hpp" -#include "tm_generate.h" + +#include "tm1_format.h" namespace TEngine { +namespace TMSerializer1 { + using op_load_t = std::function; tm_uoffset_t SaveTmOperator(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); @@ -121,6 +124,8 @@ template const T* GetTmPtr(void* const start_ptr, tm_uoffset_t tm_o return nullptr; } +} // namespace TMSerializer1 + } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/serializer/include/tengine/v1/tm1_serializer.hpp b/serializer/include/tengine/v1/tm1_serializer.hpp new file mode 100644 index 000000000..49c309e65 --- 
/dev/null +++ b/serializer/include/tengine/v1/tm1_serializer.hpp @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: jingyou@openailab.com + */ +#ifndef __TM1_SERIALIZER_HPP__ +#define __TM1_SERIALIZER_HPP__ + +#include "serializer.hpp" +#include "static_graph_interface.hpp" +#include "logger.hpp" + +#include "tm1_format.h" +#include "tm_serializer.hpp" + +namespace TEngine { + +namespace TMSerializer1 { + +class TmSerializer1 : public TmSerializer +{ + using name_map_t = std::unordered_map; + +public: + TmSerializer1() + { + name_ = "tm_loader"; + version_ = "1.0"; + format_name_ = "tengine"; + } + + virtual ~TmSerializer1(){}; + + bool LoadModelFromMem(void* mmap_buf, StaticGraph* graph) override; + bool SaveModelIntoMem(void* start_ptr, Graph* graph, uint32_t* tm_model_size) override; + + bool LoadBinaryFile(const char* tm_fname, int& fd, void*& buf, int& size); + bool LoadNode(StaticGraph* graph, StaticNode* node, const TM_Node* tm_node, void* mmap_buf); + bool LoadTensor(StaticGraph* graph, const TM_Tensor* tm_tensor, const TM_Buffer* tm_buf, void* mmap_buf); + bool LoadGraph(StaticGraph* graph, const TM_Model* tm_model, void* mmap_buf); + + 
tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, Graph* graph); + tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, Node* node, name_map_t& tensor_name_map); + tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, Tensor* tensor, unsigned int tensor_id, + unsigned int buffer_id); + + bool IsSaveString(void); + bool IsSaveData(void); +}; + +} // namespace TMSerializer1 + +} // namespace TEngine + +#endif diff --git a/serializer/include/tengine/v2/tm2_format.h b/serializer/include/tengine/v2/tm2_format.h new file mode 100644 index 000000000..f297425f9 --- /dev/null +++ b/serializer/include/tengine/v2/tm2_format.h @@ -0,0 +1,535 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#ifndef __TM2_FORMAT_H__ +#define __TM2_FORMAT_H__ + +#include "tm_generate.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define TM2_FILE_VER_MAIN 2 +#define TM2_FILE_VER_SUB 0 +#define TM2_FILE_VER_COMPILE 0 + +#define TM2_OP_VER 1 + +#define TM2_NOT_SET 0x00 + +/* Type define */ +typedef uint32_t tm_uoffset_t; /* offset is 4-byte unsigned integer */ +typedef uint32_t tm_size_t; /* size is 4-byte unsigned integer */ +typedef uint8_t tm_bool_t; /* bool is 1-byte unsigned integer */ + +/* Operator strings */ +#define TM2_OPSTR_ACCURACY "Accuracy" +#define TM2_OPSTR_BATCHNORMALIZATION "BatchNormalization" +#define TM2_OPSTR_BILINEARRESIZE "BilinearResize" +#define TM2_OPSTR_CONCAT "Concat" +#define TM2_OPSTR_CONST "Const" +#define TM2_OPSTR_CONVOLUTION "Convolution" +#define TM2_OPSTR_DECONVOLUTION "Deconvolution" +#define TM2_OPSTR_DETECTIONOUTPUT "DetectionOutput" +#define TM2_OPSTR_DROPOUT "Dropout" +#define TM2_OPSTR_ELTWISE "Eltwise" +#define TM2_OPSTR_FLATTEN "Flatten" +#define TM2_OPSTR_FULLYCONNECTED "FullyConnected" +#define TM2_OPSTR_INPUTOP "InputOp" +#define TM2_OPSTR_LRN "LRN" +#define TM2_OPSTR_NORMALIZE "Normalize" +#define TM2_OPSTR_PERMUTE "Permute" +#define TM2_OPSTR_POOLING "Pooling" +#define TM2_OPSTR_PRELU "PReLU" +#define TM2_OPSTR_PRIORBOX "PriorBox" +#define TM2_OPSTR_REGION "Region" +#define TM2_OPSTR_RELU "ReLu" +#define TM2_OPSTR_RELU6 "ReLu6" +#define TM2_OPSTR_REORG "Reorg" +#define TM2_OPSTR_RESHAPE "Reshape" +#define TM2_OPSTR_ROIPOOLING "ROIPooling" +#define TM2_OPSTR_RPN "RPN" +#define TM2_OPSTR_SCALE "Scale" +#define TM2_OPSTR_SLICE "Slice" +#define TM2_OPSTR_SOFTMAX "Softmax" +#define TM2_OPSTR_SPLIT "Split" +#define TM2_OPSTR_DETECTIONPOSTPROCESS "DetectionPostProcess" +#define TM2_OPSTR_GEMM "Gemm" +#define TM2_OPSTR_GENERIC "Generic" +#define TM2_OPSTR_LOGISTIC "Logistic" +#define TM2_OPSTR_LSTM "LSTM" +#define TM2_OPSTR_RNN 
"RNN" +#define TM2_OPSTR_TANH "Tanh" +#define TM2_OPSTR_SIGMOID "Sigmoid" +#define TM2_OPSTR_SQUEEZE "Squeeze" +#define TM2_OPSTR_FUSEDBNSCALERELU "Fused.BNScaleReLu" + +/* Operator types */ +#define TM2_OPTYPE_ACCURACY 0 /* No Param */ +#define TM2_OPTYPE_BATCHNORMALIZATION 1 /* TM2_BatchNormParam */ +#define TM2_OPTYPE_BILINEARRESIZE 2 /* TM2_ResizeParam */ +#define TM2_OPTYPE_CONCAT 3 /* TM2_ConcatParam */ +#define TM2_OPTYPE_CONST 4 /* No Param */ +#define TM2_OPTYPE_CONVOLUTION 5 /* TM2_ConvParam */ +#define TM2_OPTYPE_DECONVOLUTION 6 /* TM2_DeconvParam */ +#define TM2_OPTYPE_DETECTIONOUTPUT 7 /* TM2_DetectionOutputParam */ +#define TM2_OPTYPE_DROPOUT 8 /* No Param */ +#define TM2_OPTYPE_ELTWISE 9 /* TM2_EltwiseParam */ +#define TM2_OPTYPE_FLATTEN 10 /* TM2_FlattenParam */ +#define TM2_OPTYPE_FULLYCONNECTED 11 /* TM2_FCParam */ +#define TM2_OPTYPE_INPUTOP 12 /* No Param */ +#define TM2_OPTYPE_LRN 13 /* TM2_LRNParam */ +#define TM2_OPTYPE_NORMALIZE 14 /* TM2_NormalizeParam */ +#define TM2_OPTYPE_PERMUTE 15 /* TM2_PermuteParam */ +#define TM2_OPTYPE_POOLING 16 /* TM2_PoolParam */ +#define TM2_OPTYPE_PRELU 17 /* No Param */ +#define TM2_OPTYPE_PRIORBOX 18 /* TM2_PriorBoxParam */ +#define TM2_OPTYPE_REGION 19 /* TM2_RegionParam */ +#define TM2_OPTYPE_RELU 20 /* TM2_ReLuParam */ +#define TM2_OPTYPE_RELU6 21 /* No Param */ +#define TM2_OPTYPE_REORG 22 /* TM2_ReorgParam */ +#define TM2_OPTYPE_RESHAPE 23 /* TM2_ReshapeParam */ +#define TM2_OPTYPE_ROIPOOLING 24 /* TM2_ROIPoolingParam */ +#define TM2_OPTYPE_RPN 25 /* TM2_RPNParam */ +#define TM2_OPTYPE_SCALE 26 /* TM2_ScaleParam */ +#define TM2_OPTYPE_SLICE 27 /* TM2_SliceParam */ +#define TM2_OPTYPE_SOFTMAX 28 /* TM2_SoftmaxParam */ +#define TM2_OPTYPE_SPLIT 29 /* No Param */ +#define TM2_OPTYPE_DETECTIONPOSTPROCESS 30 /* TM2_DetectionPostProcessParam */ +#define TM2_OPTYPE_GEMM 31 /* TM2_GemmParam */ +#define TM2_OPTYPE_GENERIC 32 /* TM2_GenericParam */ +#define TM2_OPTYPE_LOGISTIC 33 /* No Param */ +#define 
TM2_OPTYPE_LSTM 34 /* TM2_LstmParam */ +#define TM2_OPTYPE_RNN 35 /* TM2_RnnParam */ +#define TM2_OPTYPE_TANH 36 /* No Param */ +#define TM2_OPTYPE_SIGMOID 37 /* No Param */ +#define TM2_OPTYPE_SQUEEZE 38 /* TM2_SqueezeParam */ +#define TM2_OPTYPE_FUSEDBNSCALERELU 39 /* No Param */ +#define TM2_OPTYPE_NUM 40 + +/* --------------------- -------- TM objects -------------------------------- */ + +typedef struct +{ + uint16_t ver_main; /* main version of Tengine model file format */ + uint16_t ver_sub; /* sub version of Tengine model file format */ + uint16_t ver_compile; /* compile version of Tengine model file format */ + tm_uoffset_t offset_root; /* offset of root table (TM2_Model) */ +} TM2_Header; + +/* Root table of Tengine model */ +typedef struct +{ + int32_t orig_format; /* format of original model */ + int32_t sub_format; /* sub format for DLA model */ + tm_uoffset_t offset_vo_subgraphs; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_s_mname; /* offset of string */ +} TM2_Model; + +/* Only 1 subgraph is supported currently */ +typedef struct +{ + uint32_t subgraph_id; /* subgraph id */ + int32_t graph_layout; /* actual data layout */ + int32_t model_layout; /* data layout of original model */ + tm_uoffset_t offset_vi_input_indices; /* offset of TM2_Vector_indices */ + tm_uoffset_t offset_vi_output_indices; /* offset of TM2_Vector_indices */ + tm_uoffset_t offset_vo_seq_nodes; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_vo_tensors; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_vo_buffers; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_s_sname; /* offset of string */ +} TM2_Subgraph; + +typedef struct +{ + tm_uoffset_t offset_s_attrname; /* offset of string */ + tm_uoffset_t offset_s_attrval; /* offset of string */ + int32_t attr_type; +} TM2_Attr; + +typedef struct +{ + uint32_t node_id; /* node id */ + tm_uoffset_t offset_vi_input_tensors; /* offset of TM2_Vector_indices */ + tm_uoffset_t offset_vi_output_tensors; 
/* offset of TM2_Vector_indices */ + tm_uoffset_t offset_t_operator; /* offset of table */ + tm_uoffset_t offset_s_nname; /* offset of string */ + tm_uoffset_t offset_vo_attrs; /* offset of TM2_Vector_offsets */ + tm_bool_t dynamic_shape; +} TM2_Node; + +typedef struct +{ + uint32_t op_ver; /* version of operator */ + uint32_t operator_type; /* operator type */ + tm_uoffset_t offset_t_param; /* offset of table */ +} TM2_Operator; + +typedef struct +{ + int32_t zero_point; + float scale; + int32_t width; +} TM2_QuantParam; + +typedef struct +{ + uint32_t tensor_id; + uint32_t buffer_id; + tm_uoffset_t offset_vd_dims; /* offset of TM2_Vector_dims */ + tm_uoffset_t offset_s_tname; /* offset of string */ + tm_uoffset_t offect_vo_quantparams; /* offset of TM2_Vector_offsets */ + int32_t layout; + int32_t type; + int32_t data_type; +} TM2_Tensor; + +typedef struct +{ + tm_size_t size; /* buffer size */ + tm_uoffset_t offset_data; /* offset of buffer data */ +} TM2_Buffer; + +typedef struct +{ + tm_size_t size; /* string size */ + tm_uoffset_t offset_data; /* offset of string data */ +} TM2_String; + +/* ------------------------ ------- Vectors --------------------------------- */ + +typedef struct +{ + tm_size_t v_num; /* number of vector elements */ + tm_uoffset_t offsets[0]; +} TM2_Vector_offsets; + +typedef struct +{ + tm_size_t v_num; /* number of vector elements */ + uint32_t indices[0]; +} TM2_Vector_indices; + +typedef struct +{ + tm_size_t v_num; /* number of vector elements */ + int32_t dims[0]; +} TM2_Vector_dims; + +typedef struct +{ + tm_size_t v_num; /* number of vector elements */ + float data[0]; +} TM2_Vector_floats; + +typedef struct +{ + tm_size_t v_num; /* number of vector elements */ + float data[0][4]; /* x0, y0, x1, y1 */ +} TM2_Vector_anchors; + +/* -------------------- ------- Operator params ----------------------------- */ + +typedef struct +{ + int32_t max_input_num; + int32_t max_output_num; + tm_uoffset_t offset_s_opname; /* offset of string 
*/ +} TM2_GenericParam; + +typedef struct +{ + float rescale_factor; + float eps; + int32_t caffe_flavor; +} TM2_BatchNormParam; + +typedef struct +{ + int32_t axis; +} TM2_ConcatParam; + +typedef struct +{ + int32_t kernel_h; + int32_t kernel_w; + int32_t stride_h; + int32_t stride_w; + int32_t dilation_h; + int32_t dilation_w; + int32_t input_channel; + int32_t output_channel; + int32_t group; + int32_t activation; + int32_t pad_h0; /* top padding rows */ + int32_t pad_w0; /* left padding columns */ + int32_t pad_h1; /* bottom padding rows */ + int32_t pad_w1; /* right padding columns */ +} TM2_ConvParam; + +typedef struct +{ + int32_t num_output; + int32_t kernel_h; + int32_t kernel_w; + int32_t stride_h; + int32_t stride_w; + int32_t pad_w0; + int32_t pad_h0; + int32_t pad_w1; + int32_t pad_h1; + int32_t dilation_h; + int32_t dilation_w; + int32_t group; + int32_t activation; +} TM2_DeconvParam; + +typedef struct +{ + int32_t num_classes; + int32_t keep_top_k; + int32_t nms_top_k; + float confidence_threshold; + float nms_threshold; +} TM2_DetectionOutputParam; + +typedef struct +{ + uint32_t type; + int32_t caffe_flavor; +} TM2_EltwiseParam; + +typedef struct +{ + int32_t num_output; +} TM2_FCParam; + +typedef struct +{ + int32_t axis; + int32_t end_axis; +} TM2_FlattenParam; + +typedef struct +{ + int32_t local_size; + float alpha; + float beta; + int32_t norm_region; + float k; +} TM2_LRNParam; + +typedef struct +{ + int32_t across_spatial; + int32_t channel_shared; +} TM2_NormalizeParam; + +typedef struct +{ + int32_t flag; + int32_t order0; + int32_t order1; + int32_t order2; + int32_t order3; +} TM2_PermuteParam; + +typedef struct +{ + uint32_t alg; + int32_t kernel_h; + int32_t kernel_w; + int32_t stride_h; + int32_t stride_w; + int32_t global; + int32_t caffe_flavor; + int32_t pad_h0; /* top padding rows */ + int32_t pad_w0; /* left padding columns */ + int32_t pad_h1; /* bottom padding rows */ + int32_t pad_w1; /* right padding columns */ +} 
TM2_PoolParam; + +typedef struct +{ + tm_uoffset_t offset_vf_min_size; /* offset of TM2_Vector_floats */ + tm_uoffset_t offset_vf_max_size; /* offset of TM2_Vector_floats */ + tm_uoffset_t offset_vf_variance; /* offset of TM2_Vector_floats */ + tm_uoffset_t offset_vf_aspect_ratio; /* offset of TM2_Vector_floats */ + int32_t flip; + int32_t clip; + int32_t img_size; + int32_t img_h; + int32_t img_w; + float step_w; + float step_h; + float offset; + int32_t num_priors; + int32_t out_dim; +} TM2_PriorBoxParam; + +typedef struct +{ + int32_t num_classes; + int32_t side; + int32_t num_box; + int32_t coords; + float confidence_threshold; + float nms_threshold; + tm_uoffset_t offset_vf_biases; /* offset of TM2_Vector_floats */ +} TM2_RegionParam; + +typedef struct +{ + float negative_slope; +} TM2_ReLuParam; + +typedef struct +{ + int32_t stride; +} TM2_ReorgParam; + +typedef struct +{ + int32_t dim_0; + int32_t dim_1; + int32_t dim_2; + int32_t dim_3; + int32_t dim_size; + int32_t axis; +} TM2_ReshapeParam; + +typedef struct +{ + float scale_x; + float scale_y; +} TM2_ResizeParam; + +typedef struct +{ + int32_t pooled_h; + int32_t pooled_w; + float spatial_scale; +} TM2_ROIPoolingParam; + +typedef struct +{ + tm_uoffset_t offset_vf_ratios; /* pointer to TM2_Vector_floats */ + tm_uoffset_t offset_vf_anchor_scales; /* pointer to TM2_Vector_floats */ + int32_t feat_stride; + int32_t basesize; + int32_t min_size; + int32_t per_nms_topn; + int32_t post_nms_topn; + float nms_thresh; + tm_uoffset_t offset_va_anchors; /* offset of TM2_Vector_anchors */ +} TM2_RPNParam; + +typedef struct +{ + int32_t axis; + int32_t num_axes; + int32_t bias_term; +} TM2_ScaleParam; + +typedef struct +{ + int32_t axis; + tm_uoffset_t offset_vi_slice_points; /* offset of TM2_Vector_dims */ + tm_uoffset_t offset_vi_begins; /* offset of TM2_Vector_dims */ + tm_uoffset_t offset_vi_sizes; /* offset of TM2_Vector_dims */ + int32_t iscaffe; +} TM2_SliceParam; + +typedef struct +{ + int32_t axis; +} 
TM2_SoftmaxParam; + +typedef struct +{ + int32_t max_detections; + int32_t max_classes_per_detection; + float nms_score_threshold; + float nms_iou_threshold; + int32_t num_classes; + tm_uoffset_t offset_vf_scales; /* y_scale, x_scale, h_scale, w_scale */ +} TM2_DetectionPostProcessParam; + +typedef struct +{ + float alpha; + float beta; + int32_t transA; + int32_t transB; +} TM2_GemmParam; + +typedef struct +{ + float forget_bias; + float clip; + int32_t output_len; + int32_t sequence_len; + int32_t input_size; + int32_t hidden_size; + int32_t cell_size; + int32_t has_peephole; + int32_t has_projection; + int32_t has_clip; + int32_t has_bias; + int32_t has_init_state; + int32_t forget_act; + int32_t input_act; + int32_t output_act; + int32_t cellin_act; + int32_t cellout_act; +} TM2_LstmParam; + +typedef struct +{ + float clip; + int32_t output_len; + int32_t sequence_len; + int32_t input_size; + int32_t hidden_size; + int32_t has_clip; + int32_t has_bias; + int32_t has_init_state; + int32_t activation; +} TM2_RnnParam; + +typedef struct +{ + int32_t dim_0; + int32_t dim_1; + int32_t dim_2; + int32_t dim_3; +} TM2_SqueezeParam; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/serializer/include/tengine/v2/tm2_op_serializer.hpp b/serializer/include/tengine/v2/tm2_op_serializer.hpp new file mode 100644 index 000000000..f243f08c1 --- /dev/null +++ b/serializer/include/tengine/v2/tm2_op_serializer.hpp @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#ifndef __TM2_OP_SERIALIZER_HPP__ +#define __TM2_OP_SERIALIZER_HPP__ + +#include "static_graph_interface.hpp" +#include "logger.hpp" + +#include "operator/batch_norm.hpp" +#include "operator/concat.hpp" +#include "operator/convolution.hpp" +#include "operator/deconvolution.hpp" +#include "operator/detection_output.hpp" +#include "operator/eltwise.hpp" +#include "operator/fully_connected.hpp" +#include "operator/flatten.hpp" +#include "operator/lrn.hpp" +#include "operator/normalize.hpp" +#include "operator/permute.hpp" +#include "operator/pooling.hpp" +#include "operator/priorbox.hpp" +#include "operator/region.hpp" +#include "operator/relu.hpp" +#include "operator/reorg.hpp" +#include "operator/reshape.hpp" +#include "operator/resize.hpp" +#include "operator/roi_pooling.hpp" +#include "operator/rpn.hpp" +#include "operator/scale.hpp" +#include "operator/slice.hpp" +#include "operator/softmax.hpp" +#include "operator/detection_postprocess.hpp" +#include "operator/gemm.hpp" +#include "operator/generic.hpp" +#include "operator/logistic.hpp" +#include "operator/lstm.hpp" +#include "operator/rnn.hpp" +#include "operator/tanh.hpp" +#include "operator/sigmoid.hpp" +#include "operator/squeeze.hpp" +#include "operator/fused_operator.hpp" + +#include "operator/batch_norm_param.hpp" +#include "operator/concat_param.hpp" +#include "operator/conv_param.hpp" +#include "operator/deconv_param.hpp" +#include "operator/detection_output_param.hpp" +#include 
"operator/eltwise_param.hpp" +#include "operator/fc_param.hpp" +#include "operator/flatten_param.hpp" +#include "operator/lrn_param.hpp" +#include "operator/normalize_param.hpp" +#include "operator/permute_param.hpp" +#include "operator/pool_param.hpp" +#include "operator/priorbox_param.hpp" +#include "operator/region_param.hpp" +#include "operator/relu_param.hpp" +#include "operator/reorg_param.hpp" +#include "operator/reshape_param.hpp" +#include "operator/resize_param.hpp" +#include "operator/roi_pooling_param.hpp" +#include "operator/rpn_param.hpp" +#include "operator/scale_param.hpp" +#include "operator/slice_param.hpp" +#include "operator/softmax_param.hpp" +#include "operator/detection_postprocess_param.hpp" +#include "operator/gemm_param.hpp" +#include "operator/generic_param.hpp" +#include "operator/lstm_param.hpp" +#include "operator/rnn_param.hpp" +#include "operator/squeeze_param.hpp" + +#include "tm2_format.h" + +namespace TEngine { + +namespace TMSerializer2 { + +using op_load_t = std::function; +using op_save_t = std::function; + +std::string GetOpStr(uint32_t op_type); + +op_load_t LoadTmOpFunc(uint32_t op_type); +bool LoadTmAccuracyOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmBatchNormOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmResizeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmConcatOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmConstOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmConvOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmDeconvOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmDetectionOutputOp(StaticGraph* graph, StaticNode* node, void* const 
start_ptr, const TM2_Operator* tm_op); +bool LoadTmDropoutOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmEltwiseOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmFlattenOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmFCOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmInputOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmLRNOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmNormalizeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmPermuteOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmPoolingOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmPreluOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmPriorBoxOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmRegionOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmReLuOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmRelu6Op(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmReorgOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmReshapeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmROIPoolingOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmRPNOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const 
TM2_Operator* tm_op); +bool LoadTmScaleOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmSliceOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmSoftmaxOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmSplitOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmDetectionPostProcessOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmGemmOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmGenericOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmLogisticOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmLstmOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmRnnOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmTanhOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmSigmoidOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmSqueezeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmFusedbnscalereluOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); + +op_save_t SaveTmOpFunc(uint32_t op_type); +tm_uoffset_t SaveTmAccuracyOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmBatchNormOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmConcatOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmConstOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); 
+tm_uoffset_t SaveTmConvOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmDeconvOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmDetectionOutputOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmDropoutOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmEltwiseOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmFCOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmFlattenOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmInputOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmLRNOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmNormalizeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmPermuteOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmPoolingOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmPreluOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmRegionOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmReLuOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmRelu6Op(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmReorgOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmReshapeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmResizeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmROIPoolingOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t 
SaveTmScaleOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmSoftmaxOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmSplitOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmDetectionPostProcessOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmGemmOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmGenericOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmLogisticOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmLstmOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmRnnOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmTanhOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmSigmoidOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmSqueezeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmFusedbnscalereluOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); + +template const T* GetTmPtr(void* const start_ptr, tm_uoffset_t tm_offset) +{ + if(tm_offset != TM2_NOT_SET) + return reinterpret_cast(reinterpret_cast(start_ptr) + tm_offset); + else + return nullptr; +} + +} // namespace TMSerializer2 + +} // namespace TEngine + +#endif diff --git a/serializer/include/tengine/v2/tm2_serializer.hpp b/serializer/include/tengine/v2/tm2_serializer.hpp new file mode 100644 index 000000000..850aa6bb5 --- /dev/null +++ b/serializer/include/tengine/v2/tm2_serializer.hpp @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#ifndef __TM2_SERIALIZER_HPP__ +#define __TM2_SERIALIZER_HPP__ + +#include "serializer.hpp" +#include "static_graph_interface.hpp" +#include "logger.hpp" + +#include "tm2_format.h" +#include "tm_serializer.hpp" + +namespace TEngine { + +namespace TMSerializer2 { + +class TmSerializer2 : public TmSerializer +{ + using name_map_t = std::unordered_map; + +public: + TmSerializer2() + { + name_ = "tm2_loader"; + version_ = "2.0"; + format_name_ = "tengine"; + } + + virtual ~TmSerializer2(){}; + + bool LoadModelFromMem(void* mmap_buf, StaticGraph* graph) override; + bool SaveModelIntoMem(void* start_ptr, Graph* graph, uint32_t* tm_model_size) override; + + bool LoadBinaryFile(const char* tm_fname, int& fd, void*& buf, int& size); + bool LoadNode(StaticGraph* graph, StaticNode* node, const TM2_Node* tm_node, void* mmap_buf); + bool LoadTensor(StaticGraph* graph, const TM2_Tensor* tm_tensor, const TM2_Buffer* tm_buf, void* mmap_buf); + bool LoadGraph(StaticGraph* graph, const TM2_Model* tm_model, void* mmap_buf); + + tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, Graph* graph); + tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, Node* node, name_map_t& 
tensor_name_map); + tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, Tensor* tensor, unsigned int tensor_id, + unsigned int buffer_id); + + bool IsSaveString(void); + bool IsSaveData(void); +}; + +} // namespace TMSerializer2 + +} // namespace TEngine + +#endif diff --git a/serializer/include/tf_lite/flatbuffers/flatbuffers.h b/serializer/include/tf_lite/flatbuffers/flatbuffers.h index e34c55d8a..4154639a3 100644 --- a/serializer/include/tf_lite/flatbuffers/flatbuffers.h +++ b/serializer/include/tf_lite/flatbuffers/flatbuffers.h @@ -1902,7 +1902,7 @@ class Verifier FLATBUFFERS_FINAL_CLASS public: Verifier(const uint8_t* buf, size_t buf_len, uoffset_t _max_depth = 64, uoffset_t _max_tables = 1000000) : buf_(buf), end_(buf + buf_len), depth_(0), max_depth_(_max_depth), num_tables_(0), max_tables_(_max_tables) - // clang-format off +// clang-format off #ifdef FLATBUFFERS_TRACK_VERIFIER_BUFFER_SIZE , upper_bound_(buf) #endif diff --git a/serializer/include/tf_serializer.hpp b/serializer/include/tf_serializer.hpp index 2a9231be6..127e23cc9 100644 --- a/serializer/include/tf_serializer.hpp +++ b/serializer/include/tf_serializer.hpp @@ -106,6 +106,81 @@ struct LSTMNode : public TFNode } }; +struct RNNNode : public TFNode +{ + float clip; + + std::string direction; + + /* optional inputs */ + TFNode* kernel; + TFNode* bias; + TFNode* init_h; + + std::set rnn_graph; + + RNNNode() + { + kernel = nullptr; + bias = nullptr; + init_h = nullptr; + } + + ~RNNNode() + { + auto rnn_ir = rnn_graph.begin(); + auto rnn_end = rnn_graph.end(); + + while(rnn_ir != rnn_end) + { + delete(*rnn_ir); + rnn_ir++; + } + } +}; + +struct GRUNode : public TFNode +{ + float clip; + + std::string direction; + + /* optional inputs */ + TFNode* kernel; + TFNode* bias; + TFNode* init_h; + //gru kernel & bias + TFNode* gate_kernel; + TFNode* gate_bias; + TFNode* candidate_kernel; + TFNode* candidate_bias; + + std::set rnn_graph; + + GRUNode() + { + kernel = nullptr; + bias = 
nullptr; + init_h = nullptr; + gate_kernel= nullptr; + gate_bias= nullptr; + candidate_kernel= nullptr; + candidate_bias= nullptr; + } + + ~GRUNode() + { + auto rnn_ir = rnn_graph.begin(); + auto rnn_end = rnn_graph.end(); + + while(rnn_ir != rnn_end) + { + delete(*rnn_ir); + rnn_ir++; + } + } +}; + struct TFGraph { std::vector seq_nodes; @@ -120,7 +195,7 @@ struct TFGraph #define TF_RNN_LSTM 0 #define TF_RNN_GRU 1 #define TF_RNN_BASIC_LSTM 2 - +#define TF_RNN_BASIC_RNN 3 class TFSerializer : public Serializer { public: @@ -165,6 +240,10 @@ class TFSerializer : public Serializer void StripRNNScope(TFGraph& tf_graph, std::string& rnn_scope, int rnn_type); void ParseLSTMGraph(TFGraph& tf_graph, LSTMNode* lstm_node, std::set& rnn_graph); + + void ParseRNNGraph(TFGraph& tf_graph, RNNNode* rnn_node, std::set& rnn_graph); + + void ParseGRUGraph(TFGraph& tf_graph, GRUNode* gru_node, std::set& rnn_graph); }; } // namespace TEngine diff --git a/serializer/include/tm_generate.h b/serializer/include/tm_generate.h index df0f6e120..ab09492d3 100644 --- a/serializer/include/tm_generate.h +++ b/serializer/include/tm_generate.h @@ -24,15 +24,15 @@ #ifndef __TM_GENERATE_H__ #define __TM_GENERATE_H__ -#include "tm_format.h" +#include #ifdef __cplusplus extern "C" { #endif -tm_uoffset_t WriteTmFileAlign1(void* const start_ptr, tm_uoffset_t* cur_pos, const void* buf, const tm_size_t buf_size); -tm_uoffset_t WriteTmFileAlign4(void* const start_ptr, tm_uoffset_t* cur_pos, const void* buf, const tm_size_t buf_size); -tm_uoffset_t WriteTmObject(void* const start_ptr, tm_uoffset_t* cur_pos, const void* buf, const tm_size_t buf_size); +uint32_t WriteTmFileAlign1(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size); +uint32_t WriteTmFileAlign4(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size); +uint32_t WriteTmObject(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size); #ifdef __cplusplus } diff 
--git a/serializer/include/tm_serializer.hpp b/serializer/include/tm_serializer.hpp index 99f8b2c67..0e719a06d 100644 --- a/serializer/include/tm_serializer.hpp +++ b/serializer/include/tm_serializer.hpp @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2018, Open AI Lab + * Copyright (c) 2019, Open AI Lab * Author: jingyou@openailab.com */ #ifndef __TM_SERIALIZER_HPP__ @@ -26,23 +26,13 @@ #include "serializer.hpp" #include "static_graph_interface.hpp" -#include "logger.hpp" -#include "tm_generate.h" namespace TEngine { class TmSerializer : public Serializer { - using name_map_t = std::unordered_map; - public: - TmSerializer() - { - name_ = "tm_loader"; - version_ = "0.1"; - format_name_ = "tengine"; - } - + TmSerializer() {}; virtual ~TmSerializer(){}; unsigned int GetFileNum(void) override @@ -65,21 +55,17 @@ class TmSerializer : public Serializer return false; } - bool LoadModelFromMem(void* mmap_buf, StaticGraph* graph); + bool LoadBinaryFile(const char* tm_fname, int& fd, void*& buf, int& size); - bool IsSaveString(void); - bool IsSaveData(void); + virtual bool LoadModelFromMem(void* mmap_buf, StaticGraph* graph) { return false; } + virtual bool SaveModelIntoMem(void* start_ptr, Graph* graph, uint32_t* tm_model_size) { return false; } +}; -protected: - bool LoadBinaryFile(const char* tm_fname, int& fd, void*& buf, int& size); - bool LoadNode(StaticGraph* graph, StaticNode* node, const TM_Node* tm_node, void* mmap_buf); - bool LoadTensor(StaticGraph* graph, const TM_Tensor* tm_tensor, const TM_Buffer* tm_buf, void* mmap_buf); - bool LoadGraph(StaticGraph* graph, const TM_Model* tm_model, void* mmap_buf); +using TmSerializerPtr = std::shared_ptr; +using TmSerializerFactory = SpecificFactory; - tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, Graph* graph); - tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, Node* node, name_map_t& tensor_name_map); - tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, 
Tensor* tensor, unsigned int tensor_id, - unsigned int buffer_id); +class TmSerializerManager : public SimpleObjectManagerWithLock +{ }; } // namespace TEngine diff --git a/serializer/mxnet/mxnet_serializer.cpp b/serializer/mxnet/mxnet_serializer.cpp index 67c016898..1d06ee129 100644 --- a/serializer/mxnet/mxnet_serializer.cpp +++ b/serializer/mxnet/mxnet_serializer.cpp @@ -24,6 +24,8 @@ #include "mxnet_serializer.hpp" +#include "tengine_c_api.h" +#include "exec_attr.hpp" #include "type_name.hpp" #include "data_type.hpp" #include "tengine_errno.hpp" @@ -38,6 +40,11 @@ #include "operator/eltwise_param.hpp" #include "operator/fc_param.hpp" #include "operator/reshape_param.hpp" +#include "operator/swap_axis_param.hpp" +#include "operator/addn_param.hpp" +#include "operator/lstm_param.hpp" +#include "operator/gru_param.hpp" +#include "operator/permute_param.hpp" //#define DEBUG @@ -47,6 +54,20 @@ typedef std::string::size_type pos; typedef std::map::const_iterator const_iterator; using op_load_t = std::function; +std::vector &split(const std::string &str, char delim, std::vector &elems, bool skip_empty = true) { + std::istringstream iss(str); + for (std::string item; getline(iss, item, delim); ) + if (skip_empty && item.empty()) continue; + else elems.push_back(atoi(item.c_str())); + return elems; +} + +static void ParseAttr_n(const std::string str, std::vector& result) +{ + std::string s = str.substr(1, str.length() - 2); + split(s,',',result); +} + static void Trim(std::string& s, const char charlist[]) { // Erase the leading characters @@ -194,8 +215,10 @@ bool MxnetSerializer::LoadTextFile(const char* fname, std::vector& no node.name = unknown.str(); cnt_unknown_name++; } - if(node.op == "Flatten") + + if(node.op == "Flatten"||node.op == "SliceChannel") node.op = "Dropout"; + nodelist.push_back(node); nest--; continue; @@ -430,8 +453,17 @@ bool MxnetSerializer::LoadModel(const std::vector& file_list, Stati SetGraphSource(graph, file_list[1]); 
SetGraphSourceFormat(graph, "mxnet"); SetGraphConstTensorFile(graph, file_list[1]); + SetGraphLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelFormat(graph,MODEL_FORMAT_MXNET); - return LoadGraph(graph, nodelist, paramlist); + bool res = LoadGraph(graph, nodelist, paramlist); + for(std::size_t ii=0; ii < paramlist.size(); ++ii) + { + std::free(paramlist[ii].raw_data); + } + + return res; } bool MxnetSerializer::LoadConstTensor(StaticGraph* graph, const std::vector& paramlist) @@ -513,7 +545,6 @@ void MxnetSerializer::CreateInputNode(StaticGraph* graph, const std::vector { MxnetNode mxnet_node = nodelist.at(i); - if(mxnet_node.op == "null") + if(mxnet_node.op == "null"||mxnet_node.op == "_zeros") continue; if(!FindOpLoadMethod(mxnet_node.op)) @@ -640,6 +651,7 @@ static bool LoadMxnetSoftmax(StaticGraph* graph, StaticNode* node, const MxnetNo StaticOp* op = CreateStaticOp(graph, "Softmax"); SoftmaxParam param = any_cast(OpManager::GetOpDefParam("Softmax")); + param.axis = 1; SetOperatorParam(op, param); @@ -669,18 +681,30 @@ static void ParseAttr(const std::string str, std::vector& result) // Remove leading '(' and trailing ')' std::string s = str.substr(1, str.length() - 2); - pos comma_pos = s.find(','); - std::string s1 = s.substr(0, comma_pos); - std::string s2 = s.substr(comma_pos + 1); - s2.erase(0, s2.find_first_not_of(" ")); - - std::istringstream ist1(s1); - std::istringstream ist2(s2); - int i, j; - ist1 >> i; - ist2 >> j; - result.push_back(i); - result.push_back(j); + std::string s1,s2; + int i; + while(1) + { + pos comma_pos = s.find(','); + if(comma_pos != std::string::npos) + { + s1 = s.substr(0, comma_pos); + s2 = s.substr(comma_pos + 1); + s2.erase(0, s2.find_first_not_of(" ")); + std::istringstream ist1(s1); + ist1 >> i; + result.push_back(i); + s = s2; + + }else + { + std::istringstream ist2(s2); + ist2 >> i; + result.push_back(i); + break; + } + } + } static bool LoadMxnetConvolution(StaticGraph* graph, 
StaticNode* node, const MxnetNode& mxnet_node) @@ -708,8 +732,10 @@ static bool LoadMxnetConvolution(StaticGraph* graph, StaticNode* node, const Mxn if(cit != mxnet_node.attrs.end()) { ParseAttr(cit->second, v3); - param.pad_h = v3.at(0); - param.pad_w = v3.at(1); + param.pad_h0 = v3.at(0); + param.pad_h1 = v3.at(0); + param.pad_w0 = v3.at(1); + param.pad_w1 = v3.at(1); } cit = mxnet_node.attrs.find("num_group"); if(cit != mxnet_node.attrs.end()) @@ -730,7 +756,7 @@ static bool LoadMxnetConvolution(StaticGraph* graph, StaticNode* node, const Mxn #ifdef DEBUG std::cout << "ConvParam : " << param.kernel_h << ", " << param.kernel_w << ", " << param.stride_h << ", " - << param.stride_w << ", " << param.pad_h << ", " << param.pad_w << ", " << param.group << ", " + << param.stride_w << ", " << param.pad_h0 << ", " << param.pad_w0 << ", " << param.group << ", " << param.output_channel << std::endl; #endif @@ -754,10 +780,6 @@ static bool LoadMxnetPooling(StaticGraph* graph, StaticNode* node, const MxnetNo ParseAttr(cit->second, v1); param.kernel_h = v1.at(0); param.kernel_w = v1.at(1); - - param.kernel_shape.resize(2); - param.kernel_shape[0] = param.kernel_h; - param.kernel_shape[1] = param.kernel_w; } cit = mxnet_node.attrs.find("stride"); if(cit != mxnet_node.attrs.end()) @@ -765,43 +787,42 @@ static bool LoadMxnetPooling(StaticGraph* graph, StaticNode* node, const MxnetNo ParseAttr(cit->second, v2); param.stride_h = v2.at(0); param.stride_w = v2.at(1); - - param.strides.resize(2); - param.strides[0] = param.stride_h; - param.strides[1] = param.stride_w; } cit = mxnet_node.attrs.find("pad"); if(cit != mxnet_node.attrs.end()) { ParseAttr(cit->second, v3); - param.pad_h = v3.at(0); - param.pad_w = v3.at(1); - - param.pads.resize(4); - param.pads[0] = param.pad_h; - param.pads[1] = param.pad_w; - param.pads[2] = param.pad_h; - param.pads[3] = param.pad_w; + param.pad_h0 = v3.at(0); + param.pad_h1 = v3.at(0); + param.pad_w0 = v3.at(1); + param.pad_w1 = v3.at(1); } cit = 
mxnet_node.attrs.find("pool_type"); if(cit != mxnet_node.attrs.end()) { if(cit->second == "max") { - param.global = 0; param.alg = kPoolMax; } else if(cit->second == "avg") { - param.global = 1; param.alg = kPoolAvg; } } + param.global = 0; + cit = mxnet_node.attrs.find("global_pool"); + if(cit != mxnet_node.attrs.end()) + { + if(cit->second == "True") + { + param.global = 1; + } + } param.caffe_flavor = 0; #ifdef DEBUG std::cout << "PoolParam : " << param.kernel_h << ", " << param.kernel_w << ", " << param.stride_h << ", " - << param.stride_w << ", " << param.pad_h << ", " << param.pad_w << ", " << param.global << ", " + << param.stride_w << ", " << param.pad_h0 << ", " << param.pad_w0 << ", " << param.global << ", " << param.alg << std::endl; #endif @@ -851,14 +872,42 @@ static bool LoadMxnetDropout(StaticGraph* graph, StaticNode* node, const MxnetNo return true; } -static bool LoadMxnetRelu(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) +static bool LoadMxnetActivation(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) { - ReLuParam param = any_cast(OpManager::GetOpDefParam("ReLu")); - param.negative_slope = 0.f; + const_iterator act_type = mxnet_node.attrs.find("act_type"); + if(act_type != mxnet_node.attrs.end()) + { + if(act_type->second == "relu") + { + ReLuParam param = any_cast(OpManager::GetOpDefParam("ReLu")); + param.negative_slope = 0.f; - StaticOp* op = CreateStaticOp(graph, "ReLu"); - SetOperatorParam(op, param); - SetNodeOp(node, op); + StaticOp* op = CreateStaticOp(graph, "ReLu"); + SetOperatorParam(op, param); + SetNodeOp(node, op); + } + else if(act_type->second == "tanh") + { + StaticOp* op = CreateStaticOp(graph, "Tanh"); + SetNodeOp(node, op); + } + else if(act_type->second == "sigmoid") + { + StaticOp* op = CreateStaticOp(graph, "Sigmoid"); + SetNodeOp(node, op); + } + else if(act_type->second == "softmax") + { + SoftmaxParam param = any_cast(OpManager::GetOpDefParam("Softmax")); + param.axis = 1; + + 
StaticOp* op = CreateStaticOp(graph, "Softmax"); + SetOperatorParam(op, param); + SetNodeOp(node, op); + } + else + return false; + } return true; } @@ -877,7 +926,6 @@ static bool LoadMxnetEltScalar(StaticGraph* graph, StaticNode* node, const Mxnet dims.push_back(1); SetTensorDim(tensor, dims); SetTensorDataType(tensor, DataType::GetTypeID("float32")); - SetTensorDataLayout(tensor, "W"); SetTensorSize(tensor, sizeof(float)); float* mem_buf = ( float* )std::malloc(sizeof(float)); @@ -993,7 +1041,116 @@ static bool LoadMxnetReshape(StaticGraph* graph, StaticNode* node, const MxnetNo return true; } +static bool LoadMxnetPermute(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) +{ + PermuteParam param = any_cast(OpManager::GetOpDefParam("Permute")); + + const_iterator cit; + std::vector v1; + + cit = mxnet_node.attrs.find("axes"); + + ParseAttr_n(cit->second, v1); + + param.order0 = v1[0]; + param.order1 = v1[1]; + param.order2 = v1[2]; + param.order3 = -2; + + StaticOp* op = CreateStaticOp(graph, "Permute"); + SetOperatorParam(op, param); + SetNodeOp(node, op); + + return true; +} +static bool LoadMxnetSwapAxis(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) +{ + SwapAxisParam param = any_cast(OpManager::GetOpDefParam("SwapAxis")); + + const_iterator cit; + cit = mxnet_node.attrs.find("dim1"); + if(cit != mxnet_node.attrs.end()) + { + std::istringstream ist(cit->second); + ist >> param.dim_0; + } + cit = mxnet_node.attrs.find("dim2"); + if(cit != mxnet_node.attrs.end()) + { + std::istringstream ist(cit->second); + ist >> param.dim_1; + } + + StaticOp* op = CreateStaticOp(graph, "SwapAxis"); + SetOperatorParam(op, param); + SetNodeOp(node, op); + + return true; +} + +static bool LoadMxnetAddN(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) +{ + AddnParam param = any_cast(OpManager::GetOpDefParam("Addn")); + param.axis = 1; + + StaticOp* op = CreateStaticOp(graph, "Addn"); + SetOperatorParam(op, param); + 
SetNodeOp(node, op); + + return true; +} + +static bool LoadMxnetClip(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) +{ + const_iterator cit1, cit2; + cit1 = mxnet_node.attrs.find("a_max"); + cit2 = mxnet_node.attrs.find("a_min"); + if(cit1 != mxnet_node.attrs.end() && cit1->second == "6" && + cit2 != mxnet_node.attrs.end() && cit2->second == "0") + { + StaticOp* op = CreateStaticOp(graph, "ReLu6"); + SetNodeOp(node, op); + } + else + return false; + + return true; +} + +static bool LoadMxnetRNN(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) +{ + const_iterator cit = mxnet_node.attrs.find("mode"); + const_iterator cit1 = mxnet_node.attrs.find("state_size"); + int s_size=atoi(cit1->second.c_str()); + + if(cit->second == "lstm") + { + LSTMParam param = any_cast(OpManager::GetOpDefParam("LSTM")); + param.mxnet_flag=1; + + param.hidden_size=s_size; + param.cell_size=s_size; + + StaticOp* op = CreateStaticOp(graph, "LSTM"); + SetOperatorParam(op, param); + // SetOperatorDynamicShape(op); + SetNodeOp(node, op); + + } + else if(cit->second == "gru") + { + GRUParam param = any_cast(OpManager::GetOpDefParam("GRU")); + param.mxnet_flag=1; + param.hidden_size=s_size; + + StaticOp* op = CreateStaticOp(graph, "GRU"); + SetOperatorParam(op, param); + // SetOperatorDynamicShape(op); + SetNodeOp(node, op); + } + return true; +} bool MxnetSerializerRegisterOpLoader(void) { SerializerPtr serializer; @@ -1009,7 +1166,7 @@ bool MxnetSerializerRegisterOpLoader(void) p_mxnet->RegisterOpLoadMethod("Concat", op_load_t(LoadMxnetConcat)); p_mxnet->RegisterOpLoadMethod("BatchNorm", op_load_t(LoadMxnetBatchNorm)); p_mxnet->RegisterOpLoadMethod("Dropout", op_load_t(LoadMxnetDropout)); - p_mxnet->RegisterOpLoadMethod("Activation", op_load_t(LoadMxnetRelu)); + p_mxnet->RegisterOpLoadMethod("Activation", op_load_t(LoadMxnetActivation)); p_mxnet->RegisterOpLoadMethod("_minus_scalar", op_load_t(LoadMxnetEltScalar)); 
p_mxnet->RegisterOpLoadMethod("_mul_scalar", op_load_t(LoadMxnetEltScalar)); @@ -1018,6 +1175,11 @@ bool MxnetSerializerRegisterOpLoader(void) p_mxnet->RegisterOpLoadMethod("FullyConnected", op_load_t(LoadMxnetFullyConnected)); p_mxnet->RegisterOpLoadMethod("Reshape", op_load_t(LoadMxnetReshape)); + p_mxnet->RegisterOpLoadMethod("SwapAxis", op_load_t(LoadMxnetSwapAxis)); + p_mxnet->RegisterOpLoadMethod("add_n", op_load_t(LoadMxnetAddN)); + p_mxnet->RegisterOpLoadMethod("clip", op_load_t(LoadMxnetClip)); + p_mxnet->RegisterOpLoadMethod("RNN", op_load_t(LoadMxnetRNN)); + p_mxnet->RegisterOpLoadMethod("transpose", op_load_t(LoadMxnetPermute)); return true; } diff --git a/serializer/onnx/onnx_serializer.cpp b/serializer/onnx/onnx_serializer.cpp index 73e6dda00..668e8db2e 100644 --- a/serializer/onnx/onnx_serializer.cpp +++ b/serializer/onnx/onnx_serializer.cpp @@ -27,6 +27,8 @@ #include #include +#include "tengine_c_api.h" +#include "exec_attr.hpp" #include "data_type.hpp" #include "tengine_errno.hpp" #include "operator_manager.hpp" @@ -63,6 +65,9 @@ bool OnnxSerializer::LoadModel(const std::vector& file_list, Static SetGraphSource(graph, file_list[0]); SetGraphSourceFormat(graph, "onnx"); SetGraphConstTensorFile(graph, file_list[0]); + SetGraphLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelFormat(graph,MODEL_FORMAT_ONNX); return LoadGraph(model, graph); } @@ -191,7 +196,6 @@ void OnnxSerializer::CreateInputNode(StaticGraph* graph, const onnx::GraphProto& StaticTensor* tensor = CreateStaticTensor(graph, val.name()); SetTensorDataType(tensor, DataType::GetTypeID("float32")); - SetTensorDataLayout(tensor, "NCHW"); if(has_shape) SetTensorDim(tensor, dims); @@ -239,7 +243,6 @@ bool OnnxSerializer::LoadNode(StaticGraph* graph, StaticNode* node, const onnx:: StaticTensor* tensor = CreateStaticTensor(graph, output_name); SetTensorDataType(tensor, DataType::GetTypeID("float32")); - SetTensorDataLayout(tensor, "NCHW"); 
AddNodeOutputTensor(node, tensor); } @@ -307,8 +310,10 @@ static bool LoadOnnxConvolutionOp(StaticGraph* graph, StaticNode* node, const on } else if(attr.name() == "pads") { - param.pad_h = attr.ints(0); - param.pad_w = attr.ints(1); + param.pad_h0 = attr.ints(0); + param.pad_h1 = attr.ints(0); + param.pad_w0 = attr.ints(1); + param.pad_w1 = attr.ints(1); } } @@ -323,13 +328,10 @@ static bool LoadOnnxConvolutionOp(StaticGraph* graph, StaticNode* node, const on { const std::vector& dim = GetTensorDim(tensor); - SetTensorDataLayout(tensor, "NCHW"); /* onnx hide the output channel in weight ..*/ param.output_channel = dim[0]; } - else if(k == 2) - SetTensorDataLayout(tensor, "W"); } StaticOp* op = CreateStaticOp(graph, "Convolution"); @@ -353,13 +355,6 @@ static bool LoadOnnxBN(StaticGraph* graph, StaticNode* node, const onnx::NodePro param.eps = attr.f(); } - for(int k = 1; k < onnx_node.input_size(); k++) - { - const std::string& input_name = onnx_node.input(k); - StaticTensor* tensor = FindTensor(graph, input_name); - SetTensorDataLayout(tensor, "W"); - } - StaticOp* op = CreateStaticOp(graph, "BatchNormalization"); SetOperatorParam(op, param); SetNodeOp(node, op); @@ -415,8 +410,10 @@ static bool LoadOnnxPooling(StaticGraph* graph, StaticNode* node, const onnx::No } else if(attr.name() == "pads") { - param.pad_h = attr.ints(0); - param.pad_w = attr.ints(1); + param.pad_h0 = attr.ints(0); + param.pad_h1 = attr.ints(0); + param.pad_w0 = attr.ints(1); + param.pad_w1 = attr.ints(1); } } } @@ -426,20 +423,6 @@ static bool LoadOnnxPooling(StaticGraph* graph, StaticNode* node, const onnx::No return false; } - param.kernel_shape.resize(2); - param.kernel_shape[0] = param.kernel_h; - param.kernel_shape[1] = param.kernel_w; - - param.pads.resize(4); - param.pads[0] = param.pad_h; - param.pads[1] = param.pad_w; - param.pads[2] = param.pad_h; - param.pads[3] = param.pad_w; - - param.strides.resize(2); - param.strides[0] = param.stride_h; - param.strides[1] = param.stride_w; - 
StaticOp* op = CreateStaticOp(graph, "Pooling"); SetOperatorParam(op, param); @@ -488,8 +471,6 @@ static bool LoadOnnxGemm(StaticGraph* graph, StaticNode* node, const onnx::NodeP StaticTensor* bias_tensor = FindTensor(graph, onnx_node.input(2)); - SetTensorDataLayout(weight_tensor, "HW"); - SetTensorDataLayout(bias_tensor, "W"); if(param.transA) { diff --git a/serializer/plugin/init.cpp b/serializer/plugin/init.cpp index 747c7dc56..4f78bf9fa 100644 --- a/serializer/plugin/init.cpp +++ b/serializer/plugin/init.cpp @@ -76,8 +76,9 @@ extern bool TFLiteSerializerRegisterOpLoader(); #endif #ifdef CONFIG_TENGINE_SERIALIZER -extern bool TmSerializerRegisterOpLoader(); +bool TmSerializerInit(void); #endif + } // namespace TEngine using namespace TEngine; @@ -137,12 +138,7 @@ int serializer_plugin_init(void) #endif #ifdef CONFIG_TENGINE_SERIALIZER - factory->RegisterInterface("tengine"); - auto tm_serializer = factory->Create("tengine"); - - SerializerManager::SafeAdd("tengine", SerializerPtr(tm_serializer)); - - TmSerializerRegisterOpLoader(); + TmSerializerInit(); #define SrcTmName "src_tm" diff --git a/serializer/tengine/Makefile b/serializer/tengine/Makefile index acfa649af..703441439 100644 --- a/serializer/tengine/Makefile +++ b/serializer/tengine/Makefile @@ -1,4 +1,7 @@ obj-y+=tm_generate.o -obj-y+=tm_op_load.o -obj-y+=tm_op_save.o obj-y+=tm_serializer.o +obj-y+=v1/ +obj-y+=v2/ + +COMMON_CFLAGS+=-I$(shell pwd)/../include/tengine + diff --git a/serializer/tengine/tm_generate.c b/serializer/tengine/tm_generate.c index 48be4188e..27f598b93 100644 --- a/serializer/tengine/tm_generate.c +++ b/serializer/tengine/tm_generate.c @@ -30,22 +30,22 @@ extern "C" { #define ALIGN(pos, alignbytes) (((pos) + ( alignbytes )-1) & ~(( alignbytes )-1)) -tm_uoffset_t WriteTmFileAlign1(void* const start_ptr, tm_uoffset_t* cur_pos, const void* buf, const tm_size_t buf_size) +uint32_t WriteTmFileAlign1(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size) { - 
tm_uoffset_t buf_pos = *cur_pos; + uint32_t buf_pos = *cur_pos; memcpy(start_ptr + *cur_pos, buf, buf_size); *cur_pos += buf_size; return buf_pos; } -tm_uoffset_t WriteTmFileAlign4(void* const start_ptr, tm_uoffset_t* cur_pos, const void* buf, const tm_size_t buf_size) +uint32_t WriteTmFileAlign4(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size) { *cur_pos = ALIGN(*cur_pos, 4); return WriteTmFileAlign1(start_ptr, cur_pos, buf, buf_size); } -tm_uoffset_t WriteTmObject(void* const start_ptr, tm_uoffset_t* cur_pos, const void* buf, const tm_size_t buf_size) +uint32_t WriteTmObject(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size) { return WriteTmFileAlign4(start_ptr, cur_pos, buf, buf_size); } diff --git a/serializer/tengine/tm_serializer.cpp b/serializer/tengine/tm_serializer.cpp index 047f99006..d18962674 100644 --- a/serializer/tengine/tm_serializer.cpp +++ b/serializer/tengine/tm_serializer.cpp @@ -18,274 +18,26 @@ */ /* - * Copyright (c) 2018, Open AI Lab + * Copyright (c) 2019, Open AI Lab * Author: jingyou@openailab.com */ #include #include #include #include -#include -#include "data_type.hpp" #include "operator_manager.hpp" #include "static_graph.hpp" #include "graph.hpp" -#include "node.hpp" -#include "tensor.hpp" #include "tm_serializer.hpp" -#include "tm_op_serializer.hpp" -#include "compiler.hpp" -namespace TEngine { - -bool TmSerializer::IsSaveString(void) -{ - const char* env = std::getenv("TM_WITH_STRING"); - - if(env) - return true; - else - return false; -} - -bool TmSerializer::IsSaveData(void) -{ - const char* env = std::getenv("TM_FOR_BENCHMARK"); - - if(env) - return false; - else - return true; -} - -tm_uoffset_t TmSerializer::SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, Tensor* tensor, - unsigned int tensor_id, unsigned int buffer_id) -{ - TM_Tensor tm_tensor; - tm_tensor.tensor_id = tensor_id; - tm_tensor.buffer_id = buffer_id; - tm_tensor.type = 
tensor->GetType(); - - bool tm_with_string = IsSaveString(); +#define TM_FILE_MAX_SIZE 1 << 30 /* 1G */ - if(tm_with_string) - { - std::string name = tensor->GetName(); - TM_String tensor_name; - tensor_name.size = name.size(); - tensor_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), tensor_name.size); - tm_tensor.offset_s_tname = WriteTmObject(start_ptr, cur_pos, &tensor_name, sizeof(TM_String)); - } - else - tm_tensor.offset_s_tname = NOT_SET; - - const std::string& data_type = DataType::GetTypeName(tensor->GetDataType()); - if(data_type == "float32") - tm_tensor.data_type = TM_DT_FLOAT32; - else if(data_type == "float16") - tm_tensor.data_type = TM_DT_FLOAT16; - else if(data_type == "int") - tm_tensor.data_type = TM_DT_INT32; - else if(data_type == "int8") - tm_tensor.data_type = TM_DT_INT8; - - /* Get the dims of the tensor */ - TShape& shape = tensor->GetShape(); - std::vector& dim = shape.GetDim(); - if(dim.size()) - { - /* Write the vector of dims */ - size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * dim.size(); - TM_Vector_dims* v_dims = ( TM_Vector_dims* )malloc(vector_size); - v_dims->v_num = dim.size(); - for(unsigned int i = 0; i < dim.size(); i++) - { - v_dims->dims[i] = dim[i]; - } - tm_tensor.offset_vd_dims = WriteTmObject(start_ptr, cur_pos, v_dims, vector_size); - free(v_dims); - } - else - tm_tensor.offset_vd_dims = NOT_SET; - - /* Write the tensor */ - return WriteTmObject(start_ptr, cur_pos, &tm_tensor, sizeof(TM_Tensor)); -} - -tm_uoffset_t TmSerializer::SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, Node* node, - name_map_t& tensor_name_map) -{ - TM_Node tm_node; - tm_node.node_id = node->GetNodeIndex(); - tm_node.dynamic_shape = node->IsDynamicShape(); - - bool tm_with_string = IsSaveString(); - - if(tm_with_string) - { - std::string name = node->GetName(); - TM_String node_name; - node_name.size = name.size(); - node_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), 
node_name.size); - tm_node.offset_s_nname = WriteTmObject(start_ptr, cur_pos, &node_name, sizeof(TM_String)); - } - else - tm_node.offset_s_nname = NOT_SET; - - unsigned int input_num = node->GetInputNum(); - unsigned int output_num = node->GetOutputNum(); - - if(input_num) - { - /* Write the vector of input indices */ - size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * input_num; - TM_Vector_indices* v_input_indices = ( TM_Vector_indices* )malloc(vector_size); - v_input_indices->v_num = input_num; - for(unsigned int i = 0; i < input_num; i++) - { - Tensor* p_tensor = node->GetInputTensor(i); - v_input_indices->indices[i] = tensor_name_map[p_tensor->GetName()]; - } - tm_node.offset_vi_input_tensors = WriteTmObject(start_ptr, cur_pos, v_input_indices, vector_size); - free(v_input_indices); - } - else - tm_node.offset_vi_input_tensors = NOT_SET; - - if(output_num) - { - /* Write the vector of output indices */ - size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * output_num; - TM_Vector_indices* v_output_indices = ( TM_Vector_indices* )malloc(vector_size); - v_output_indices->v_num = output_num; - for(unsigned int i = 0; i < output_num; i++) - { - Tensor* p_tensor = node->GetOutputTensor(i); - v_output_indices->indices[i] = tensor_name_map[p_tensor->GetName()]; - } - tm_node.offset_vi_output_tensors = WriteTmObject(start_ptr, cur_pos, v_output_indices, vector_size); - free(v_output_indices); - } - else - tm_node.offset_vi_output_tensors = NOT_SET; - - tm_node.offset_t_operator = SaveTmOperator(start_ptr, cur_pos, node->GetOp()); - - /* Write the node */ - return WriteTmObject(start_ptr, cur_pos, &tm_node, sizeof(TM_Node)); -} - -tm_uoffset_t TmSerializer::SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, Graph* graph) -{ - TM_Subgraph tm_subgraph; - tm_subgraph.subgraph_id = 0; /* subgraph_id starts from 0 */ - tm_subgraph.offset_s_sname = NOT_SET; - - unsigned int tensor_num = 0; - unsigned int buffer_num = 0; - std::vector tensor_ptrs; 
- std::vector buf_ptrs; - std::vector buf_sizes; - name_map_t tensor_name_map; /* map of tensor name and tensor index */ - bool tm_no_data = !IsSaveData(); - - /* Write the nodes */ - size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * graph->seq_nodes.size(); - TM_Vector_offsets* v_nodes = ( TM_Vector_offsets* )malloc(vector_size); - v_nodes->v_num = graph->seq_nodes.size(); - for(unsigned int i = 0; i < graph->seq_nodes.size(); i++) - { - Node* p_node = graph->seq_nodes[i]; - for(unsigned int k = 0; k < p_node->GetOutputNum(); k++) - { - Tensor* p_tensor = p_node->GetOutputTensor(k); - tensor_ptrs.push_back(p_tensor); - tensor_name_map[p_tensor->GetName()] = tensor_num; - tensor_num++; - } - v_nodes->offsets[i] = SaveTmNode(start_ptr, cur_pos, p_node, tensor_name_map); - } - /* Write the vector of nodes */ - tm_subgraph.offset_vo_seq_nodes = WriteTmObject(start_ptr, cur_pos, v_nodes, vector_size); - - /* Write the tensors */ - vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor_num; - TM_Vector_offsets* v_tensors = ( TM_Vector_offsets* )malloc(vector_size); - v_tensors->v_num = tensor_num; - for(unsigned int i = 0; i < tensor_num; i++) - { - Tensor* p_tensor = tensor_ptrs[i]; - if(p_tensor->GetType() == kConstTensor) - { - buf_ptrs.push_back(p_tensor->GetMemAddr()); - buf_sizes.push_back(p_tensor->GetTotalSize()); - buffer_num++; - } - - v_tensors->offsets[i] = SaveTmTensor(start_ptr, cur_pos, p_tensor, i, buffer_num - 1); - } - /* Write the vector of tensors */ - tm_subgraph.offset_vo_tensors = WriteTmObject(start_ptr, cur_pos, v_tensors, vector_size); - - /* Write the buffers */ - vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * buffer_num; - TM_Vector_offsets* v_buffers = ( TM_Vector_offsets* )malloc(vector_size); - v_buffers->v_num = buffer_num; - for(unsigned int i = 0; i < buffer_num; i++) - { - TM_Buffer tm_buf; - tm_buf.size = buf_sizes[i]; - - if(tm_no_data) - { - /* TM_FOR_BENCHMARK environment variable exists. 
Not write buf data into the tm file */ - tm_buf.offset_data = NOT_SET; - } - else - { - /* TM_FOR_BENCHMARK environment variable does not exist */ - tm_buf.offset_data = - WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast(buf_ptrs[i]), tm_buf.size); - } - v_buffers->offsets[i] = WriteTmObject(start_ptr, cur_pos, &tm_buf, sizeof(TM_Buffer)); - } - /* Write the vector of buffers */ - tm_subgraph.offset_vo_buffers = WriteTmObject(start_ptr, cur_pos, v_buffers, vector_size); - - /* Write the vector of input indices */ - vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->input_nodes.size(); - TM_Vector_indices* v_input_indices = ( TM_Vector_indices* )malloc(vector_size); - v_input_indices->v_num = graph->input_nodes.size(); - for(unsigned int i = 0; i < graph->input_nodes.size(); i++) - { - v_input_indices->indices[i] = graph->input_nodes[i]->GetNodeIndex(); - } - tm_subgraph.offset_vi_input_indices = WriteTmObject(start_ptr, cur_pos, v_input_indices, vector_size); - - /* Write the vector of output indices */ - vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->output_nodes.size(); - TM_Vector_indices* v_output_indices = ( TM_Vector_indices* )malloc(vector_size); - v_output_indices->v_num = graph->output_nodes.size(); - for(unsigned int i = 0; i < graph->output_nodes.size(); i++) - { - v_output_indices->indices[i] = graph->output_nodes[i]->GetNodeIndex(); - } - tm_subgraph.offset_vi_output_indices = WriteTmObject(start_ptr, cur_pos, v_output_indices, vector_size); - - /* Write the subgraph */ - tm_uoffset_t ret = WriteTmObject(start_ptr, cur_pos, &tm_subgraph, sizeof(TM_Subgraph)); - - /* Free the memory of vectors */ - free(v_tensors); - free(v_buffers); - free(v_nodes); - free(v_input_indices); - free(v_output_indices); +namespace TEngine { - return ret; -} +extern bool register_tm1_serializer(); +extern bool register_tm2_serializer(); bool TmSerializer::SaveModel(const std::vector& file_list, Graph* graph) { @@ -294,7 +46,7 @@ bool 
TmSerializer::SaveModel(const std::vector& file_list, Graph* g return false; /* Open the tengine model file */ - int fd = open(file_list[0].c_str(), O_RDWR | O_CREAT, 0666); + int fd = open(file_list[0].c_str(), O_RDWR | O_CREAT | O_TRUNC, 0666); if(fd == -1) { LOG_ERROR() << "Could not open " << file_list[0] << "\n"; @@ -325,267 +77,29 @@ bool TmSerializer::SaveModel(const std::vector& file_list, Graph* g bool TmSerializer::SaveModel(std::vector& addr_list, std::vector& size_list, Graph* graph) { - bool tm_with_string = IsSaveString(); + uint32_t tm_model_size = 0; - void* start_ptr = ( void* )malloc(TM_FILE_MAX_SIZE); + uint32_t malloc_size = TM_FILE_MAX_SIZE; + const char* env = std::getenv("TM_FILE_MAX_SIZE"); + if(env) + malloc_size = std::atoi(env); + + void* start_ptr = ( void* )malloc(malloc_size); if(start_ptr == nullptr) { - LOG_ERROR() << "No enough memory for saving tengine model.\n"; + LOG_ERROR() << "Malloc memory failed: " << malloc_size << ".\n"; return false; } - tm_size_t tm_model_size = 0; - tm_uoffset_t cur_pos = sizeof(TM_Header); - - /* Define the TM_Header object */ - TM_Header header; - header.ver_main = TM_FILE_VER_MAIN; - header.ver_sub = TM_FILE_VER_SUB; - header.ver_compile = TM_FILE_VER_COMPILE; - - /* Define the TM_Model object */ - TM_Model tm_model; - if(tm_with_string) - { - const std::string& fname = graph->GetName(); - TM_String model_name; - model_name.size = fname.size(); - model_name.offset_data = WriteTmFileAlign1(start_ptr, &cur_pos, fname.c_str(), model_name.size); - tm_model.offset_s_mname = WriteTmObject(start_ptr, &cur_pos, &model_name, sizeof(TM_String)); - } - else - tm_model.offset_s_mname = NOT_SET; - - /* Write the subgraphs */ - /* Only 1 subgraph is supported currently */ - size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * 1; - TM_Vector_offsets* v_subgraphs = ( TM_Vector_offsets* )malloc(vector_size); - v_subgraphs->v_num = 1; - v_subgraphs->offsets[0] = SaveTmSubgraph(start_ptr, &cur_pos, graph); - 
- /* Write the vector of subgraphs */ - tm_model.offset_vo_subgraphs = WriteTmObject(start_ptr, &cur_pos, v_subgraphs, vector_size); - - /* Write the model */ - header.offset_root = WriteTmObject(start_ptr, &cur_pos, &tm_model, sizeof(TM_Model)); - tm_model_size = cur_pos; - - /* Write the header */ - cur_pos = 0; - WriteTmObject(start_ptr, &cur_pos, &header, sizeof(TM_Header)); - - free(v_subgraphs); + TmSerializerPtr tm_serializer; + TmSerializerManager::SafeGet("tm_v2", tm_serializer); + + bool ret = tm_serializer->SaveModelIntoMem(start_ptr, graph, &tm_model_size); addr_list.push_back(start_ptr); size_list.push_back(tm_model_size); - return true; -} - -bool TmSerializer::LoadNode(StaticGraph* graph, StaticNode* node, const TM_Node* tm_node, void* mmap_buf) -{ - if(tm_node->offset_vi_input_tensors != NOT_SET) - { - const TM_Vector_indices* v_input_tensors = - GetTmPtr(mmap_buf, tm_node->offset_vi_input_tensors); - - /* Set the input tensors to the node */ - for(unsigned int i = 0; i < v_input_tensors->v_num; i++) - { - StaticTensor* tensor = graph->tensor_list[v_input_tensors->indices[i]].get(); - if(!tensor) - { - LOG_ERROR() << "The input tensor not exist: " << v_input_tensors->indices[i] << "\n"; - return false; - } - AddNodeInputTensor(node, tensor); - } - } - - if(tm_node->offset_vi_output_tensors != NOT_SET) - { - const TM_Vector_indices* v_output_tensors = - GetTmPtr(mmap_buf, tm_node->offset_vi_output_tensors); - - /* Set the output tensors to the node */ - for(unsigned int i = 0; i < v_output_tensors->v_num; i++) - { - StaticTensor* tensor = graph->tensor_list[v_output_tensors->indices[i]].get(); - if(!tensor) - { - LOG_ERROR() << "The output tensor not exist: " << v_output_tensors->indices[i] << "\n"; - return false; - } - AddNodeOutputTensor(node, tensor); - } - } - return true; -} - -bool TmSerializer::LoadTensor(StaticGraph* graph, const TM_Tensor* tm_tensor, const TM_Buffer* tm_buf, void* mmap_buf) -{ - /* Set the tensor name */ - int idx = 
tm_tensor->tensor_id; - std::string tm_tensor_name; - if(tm_tensor->offset_s_tname == NOT_SET) - tm_tensor_name = "tensor_" + std::to_string(idx); - else - { - const TM_String* tm_string = GetTmPtr(mmap_buf, tm_tensor->offset_s_tname); - tm_tensor_name.assign(GetTmPtr(mmap_buf, tm_string->offset_data), tm_string->size); - } - - /* Create the static tensor */ - StaticTensor* tensor; - if(tm_tensor->type == kConstTensor) - tensor = CreateStaticConstTensor(graph, tm_tensor_name); - else - tensor = CreateStaticTensor(graph, tm_tensor_name); - if(!tensor) - { - LOG_ERROR() << "Create static const tensor failed: " << tm_tensor_name << "\n"; - return false; - } - - /* Set the dims */ - if(tm_tensor->offset_vd_dims != NOT_SET) - { - const TM_Vector_dims* v_dims = GetTmPtr(mmap_buf, tm_tensor->offset_vd_dims); - if(!v_dims || !(v_dims->v_num)) - { - LOG_ERROR() << "Get tensor dims failed\n"; - return false; - } - std::vector dims; - for(unsigned int i = 0; i < v_dims->v_num; i++) - dims.push_back(v_dims->dims[i]); - SetTensorDim(tensor, dims); - - /* Set the daya layout */ - if(v_dims->v_num == 4) - SetTensorDataLayout(tensor, "NCHW"); - else if(v_dims->v_num == 2) - SetTensorDataLayout(tensor, "HW"); - else if(v_dims->v_num == 1) - SetTensorDataLayout(tensor, "W"); - } - - /* Set the data type */ - if(tm_tensor->data_type == TM_DT_FLOAT32) - SetTensorDataType(tensor, DataType::GetTypeID("float32")); - else if(tm_tensor->data_type == TM_DT_FLOAT16) - SetTensorDataType(tensor, DataType::GetTypeID("float16")); - else if(tm_tensor->data_type == TM_DT_INT32) - SetTensorDataType(tensor, DataType::GetTypeID("int")); - else if(tm_tensor->data_type == TM_DT_INT8) - SetTensorDataType(tensor, DataType::GetTypeID("int8")); - - /* Set the memory size and pointer */ - if(tm_tensor->type == kConstTensor) - { - SetTensorSize(tensor, tm_buf->size); - void* buf = malloc(tm_buf->size); - if(tm_buf->offset_data != NOT_SET) - { - memcpy(buf, GetTmPtr(mmap_buf, tm_buf->offset_data), 
tm_buf->size); - } - - SetConstTensorBuffer(tensor, buf); - SetConstTensorFileLocation(tensor, -1, 0); - } - - return true; -} - -bool TmSerializer::LoadGraph(StaticGraph* graph, const TM_Model* tm_model, void* mmap_buf) -{ - const TM_Vector_offsets* v_graphs = GetTmPtr(mmap_buf, tm_model->offset_vo_subgraphs); - const TM_Subgraph* tm_graph = GetTmPtr(mmap_buf, v_graphs->offsets[0]); - - const TM_Vector_offsets* v_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vo_seq_nodes); - const TM_Vector_offsets* v_tensors = GetTmPtr(mmap_buf, tm_graph->offset_vo_tensors); - const TM_Vector_offsets* v_buffers = GetTmPtr(mmap_buf, tm_graph->offset_vo_buffers); - - /* Load const tensors */ - for(unsigned int i = 0; i < v_tensors->v_num; i++) - { - const TM_Tensor* tm_tensor = GetTmPtr(mmap_buf, v_tensors->offsets[i]); - const TM_Buffer* tm_buf; - if(tm_tensor->type == kConstTensor) - tm_buf = GetTmPtr(mmap_buf, v_buffers->offsets[tm_tensor->buffer_id]); - else - tm_buf = nullptr; - LoadTensor(graph, tm_tensor, tm_buf, mmap_buf); - } - - /* Create static nodes */ - unsigned int i; - for(i = 0; i < v_nodes->v_num; i++) - { - const TM_Node* tm_node = GetTmPtr(mmap_buf, v_nodes->offsets[i]); - int idx = tm_node->node_id; - std::string tm_node_name; - if(tm_node->offset_s_nname == NOT_SET) - tm_node_name = "node_" + std::to_string(idx); - else - { - const TM_String* tm_string = GetTmPtr(mmap_buf, tm_node->offset_s_nname); - tm_node_name.assign(GetTmPtr(mmap_buf, tm_string->offset_data), tm_string->size); - } - - const TM_Operator* tm_operator = GetTmPtr(mmap_buf, tm_node->offset_t_operator); - const std::string& tm_op_name = GetOpStr(tm_operator->operator_type); - - if(!FindOpLoadMethod(tm_op_name)) - { - LOG_ERROR() << "cannot find load function for operator: " << tm_op_name << "\n"; - break; - } - - StaticNode* node = CreateStaticNode(graph, tm_node_name); - if(!LoadNode(graph, node, tm_node, mmap_buf)) - break; - - op_load_t op_func = any_cast(GetOpLoadMethod(tm_op_name)); - - 
if(!op_func(graph, node, mmap_buf, tm_operator)) - break; - - /* Set the dynamic shape of the operator */ - node->op->dynamic_shape = tm_node->dynamic_shape; - } - - if(i < v_nodes->v_num) - return false; - - const TM_Vector_indices* v_input_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vi_input_indices); - const TM_Vector_indices* v_output_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vi_output_indices); - - /* Set the input nodes */ - for(unsigned int i = 0; i < v_input_nodes->v_num; i++) - { - StaticNode* node = graph->node_list[v_input_nodes->indices[i]].get(); - if(!node) - { - LOG_ERROR() << "Input node #" << v_input_nodes->indices[i] << " not exist\n"; - return false; - } - AddGraphInputNode(graph, node); - } - - /* Set the output nodes */ - for(unsigned int i = 0; i < v_output_nodes->v_num; i++) - { - StaticNode* node = graph->node_list[v_output_nodes->indices[i]].get(); - if(!node) - { - LOG_ERROR() << "Output node #" << v_output_nodes->indices[i] << " not exist\n"; - return false; - } - AddGraphOutputNode(graph, node); - } - - return true; + return ret; } bool TmSerializer::LoadBinaryFile(const char* tm_fname, int& fd, void*& buf, int& size) @@ -611,35 +125,6 @@ bool TmSerializer::LoadBinaryFile(const char* tm_fname, int& fd, void*& buf, int return true; } -bool TmSerializer::LoadModelFromMem(void* mmap_buf, StaticGraph* graph) -{ - const TM_Header* tm_header = reinterpret_cast(mmap_buf); - /* Check the version of tm file format */ - if(tm_header->ver_main != TM_FILE_VER_MAIN || tm_header->ver_sub != TM_FILE_VER_SUB || - tm_header->ver_compile != TM_FILE_VER_COMPILE) - { - printf("Wrong version of tm file\n"); - return false; - } - - const TM_Model* tm_model = GetTmPtr(mmap_buf, tm_header->offset_root); - if(tm_model->offset_s_mname == NOT_SET) - { - SetGraphIdentity(graph, "tengine", "tengine_model", "0"); - } - else - { - std::string tm_model_name; - const TM_String* tm_string = GetTmPtr(mmap_buf, tm_model->offset_s_mname); - 
tm_model_name.assign(GetTmPtr(mmap_buf, tm_string->offset_data), tm_string->size); - SetGraphIdentity(graph, "tengine", tm_model_name, "0"); - } - - if(LoadGraph(graph, tm_model, mmap_buf)) - return true; - else - return false; -} bool TmSerializer::LoadModel(const std::vector& file_list, StaticGraph* graph) { int fd; @@ -656,7 +141,17 @@ bool TmSerializer::LoadModel(const std::vector& file_list, StaticGr SetGraphSourceFormat(graph, "tengine"); SetGraphConstTensorFile(graph, file_list[0]); - bool ret = LoadModelFromMem(mmap_buf, graph); + const uint16_t* ver_main = reinterpret_cast(mmap_buf); + TmSerializerPtr tm_serializer; + if(*ver_main < 2) + { + LOG_WARN() << "The input tengine model file is in old format, please regenerate it by using tengine convert tool.\n"; + TmSerializerManager::SafeGet("tm_v1", tm_serializer); + } + else + TmSerializerManager::SafeGet("tm_v2", tm_serializer); + + bool ret = tm_serializer->LoadModelFromMem(mmap_buf, graph); munmap(const_cast(mmap_buf), mmap_size); close(fd); @@ -674,7 +169,17 @@ bool TmSerializer::LoadModel(const std::vector& addr_list, const st SetGraphSource(graph, "in_mem"); SetGraphSourceFormat(graph, "tengine"); - bool ret = LoadModelFromMem(mmap_buf, graph); + const uint16_t* ver_main = reinterpret_cast(mmap_buf); + TmSerializerPtr tm_serializer; + if(*ver_main < 2) + { + LOG_WARN() << "The input tengine model file is in old format, please regenerate it by using tengine convert tool.\n"; + TmSerializerManager::SafeGet("tm_v1", tm_serializer); + } + else + TmSerializerManager::SafeGet("tm_v2", tm_serializer); + + bool ret = tm_serializer->LoadModelFromMem(mmap_buf, graph); if(ret) graph->mem_src.push_back(mmap_buf); @@ -682,21 +187,19 @@ bool TmSerializer::LoadModel(const std::vector& addr_list, const st return ret; } -bool TmSerializerRegisterOpLoader(void) +bool TmSerializerInit(void) { - SerializerPtr serializer; + auto factory = SerializerFactory::GetFactory(); - if(!SerializerManager::SafeGet("tengine", 
serializer)) - return false; + factory->RegisterInterface("tengine"); + auto tm_serializer = factory->Create("tengine"); - TmSerializer* p_tengine = dynamic_cast(serializer.get()); + SerializerManager::SafeAdd("tengine", SerializerPtr(tm_serializer)); - for(int i = 0; i < TM_OPTYPE_NUM; i++) - { - p_tengine->RegisterOpLoadMethod(GetOpStr(i), op_load_t(LoadTmOpFunc(i))); - } + bool ret1 = register_tm1_serializer(); + bool ret2 = register_tm2_serializer(); - return true; + return (ret1 && ret2); } } // namespace TEngine diff --git a/serializer/tengine/v1/Makefile b/serializer/tengine/v1/Makefile new file mode 100644 index 000000000..c62ec5829 --- /dev/null +++ b/serializer/tengine/v1/Makefile @@ -0,0 +1,7 @@ +obj-y+=init.o +obj-y+=tm1_op_load.o +obj-y+=tm1_op_save.o +obj-y+=tm1_serializer.o + +COMMON_CFLAGS+=-I$(shell pwd)/../../include/tengine/v1 + diff --git a/serializer/tengine/v1/init.cpp b/serializer/tengine/v1/init.cpp new file mode 100644 index 000000000..734be3860 --- /dev/null +++ b/serializer/tengine/v1/init.cpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#include "tm_serializer.hpp" +#include "tm1_serializer.hpp" + +namespace TEngine { +namespace TMSerializer1 { + +extern bool TmSerializerRegisterOpLoader1(); + +} + +using namespace TMSerializer1; + +bool register_tm1_serializer(void) +{ + auto factory = TmSerializerFactory::GetFactory(); + + factory->RegisterInterface("tm_v1"); + auto tm_serializer = factory->Create("tm_v1"); + + TmSerializerManager::SafeAdd("tm_v1", TmSerializerPtr(tm_serializer)); + + return TmSerializerRegisterOpLoader1(); +} + +} // namespace TEngine + diff --git a/serializer/tengine/tm_op_load.cpp b/serializer/tengine/v1/tm1_op_load.cpp similarity index 96% rename from serializer/tengine/tm_op_load.cpp rename to serializer/tengine/v1/tm1_op_load.cpp index 27cc00db4..ffaa76e26 100644 --- a/serializer/tengine/tm_op_load.cpp +++ b/serializer/tengine/v1/tm1_op_load.cpp @@ -21,10 +21,12 @@ * Copyright (c) 2018, Open AI Lab * Author: jingyou@openailab.com */ -#include "tm_op_serializer.hpp" +#include "tm1_op_serializer.hpp" namespace TEngine { +namespace TMSerializer1 { + bool LoadTmAccuracyOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM_Operator* tm_op) { StaticOp* op = CreateStaticOp(graph, OP_STR_ACCURACY); @@ -98,13 +100,15 @@ bool LoadTmConvOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, c param.kernel_w = tm_param->kernel_w; param.stride_h = tm_param->stride_h; param.stride_w = tm_param->stride_w; - param.pad_h = tm_param->pad_h; - param.pad_w = tm_param->pad_w; param.dilation_h = tm_param->dilation_h; param.dilation_w = tm_param->dilation_w; param.output_channel = tm_param->output_channel; param.activation = tm_param->activation; param.group = tm_param->group; + param.pad_h0 = tm_param->pad_h; + param.pad_h1 = tm_param->pad_h; + param.pad_w0 = tm_param->pad_w; + param.pad_w1 = tm_param->pad_w; StaticOp* op = CreateStaticOp(graph, op_str); SetOperatorParam(op, 
param); @@ -119,11 +123,18 @@ bool LoadTmDeconvOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, DeconvParam param = any_cast(OpManager::GetOpDefParam(op_str)); const TM_DeconvParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); - param.kernel_size = tm_param->kernel_size; - param.stride = tm_param->stride; - param.pad = tm_param->pad; + param.kernel_h = tm_param->kernel_size; + param.kernel_w = tm_param->kernel_size; + param.stride_h = tm_param->stride; + param.stride_w = tm_param->stride; + param.pad_w0 = tm_param->pad; + param.pad_w1 = tm_param->pad; + param.pad_h0 = tm_param->pad; + param.pad_h1 = tm_param->pad; param.num_output = tm_param->num_output; - param.dilation = tm_param->dilation; + param.dilation_h = tm_param->dilation; + param.dilation_w = tm_param->dilation; + param.group = 1; StaticOp* op = CreateStaticOp(graph, op_str); SetOperatorParam(op, param); @@ -275,23 +286,14 @@ bool LoadTmPoolingOp(StaticGraph* graph, StaticNode* node, void* const start_ptr param.alg = static_cast(tm_param->alg); param.kernel_h = tm_param->kernel_h; param.kernel_w = tm_param->kernel_w; - param.pad_h = tm_param->pad_h; - param.pad_w = tm_param->pad_w; param.stride_h = tm_param->stride_h; param.stride_w = tm_param->stride_w; param.global = tm_param->global; param.caffe_flavor = tm_param->caffe_flavor; - param.kernel_shape.resize(2); - param.kernel_shape[0] = tm_param->kernel_shape[0]; - param.kernel_shape[1] = tm_param->kernel_shape[1]; - param.strides.resize(2); - param.strides[0] = tm_param->strides[0]; - param.strides[1] = tm_param->strides[1]; - param.pads.resize(4); - param.pads[0] = tm_param->pads[0]; - param.pads[1] = tm_param->pads[1]; - param.pads[2] = tm_param->pads[2]; - param.pads[3] = tm_param->pads[3]; + param.pad_h0 = tm_param->pads[0]; + param.pad_w0 = tm_param->pads[1]; + param.pad_h1 = tm_param->pads[2]; + param.pad_w1 = tm_param->pads[3]; StaticOp* op = CreateStaticOp(graph, op_str); SetOperatorParam(op, param); @@ -490,6 +492,7 @@ 
bool LoadTmSliceOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM_SliceParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); param.axis = tm_param->axis; + param.iscaffe = true; StaticOp* op = CreateStaticOp(graph, op_str); SetOperatorParam(op, param); @@ -659,4 +662,6 @@ std::string GetOpStr(uint32_t op_type) } } +} // namespace TMSerializer1 + } // namespace TEngine diff --git a/serializer/tengine/tm_op_save.cpp b/serializer/tengine/v1/tm1_op_save.cpp similarity index 95% rename from serializer/tengine/tm_op_save.cpp rename to serializer/tengine/v1/tm1_op_save.cpp index ab879a12e..8afa95103 100644 --- a/serializer/tengine/tm_op_save.cpp +++ b/serializer/tengine/v1/tm1_op_save.cpp @@ -21,10 +21,12 @@ * Copyright (c) 2018, Open AI Lab * Author: jingyou@openailab.com */ -#include "tm_op_serializer.hpp" +#include "tm1_op_serializer.hpp" namespace TEngine { +namespace TMSerializer1 { + inline void SetTmOperator(TM_Operator* tm_op, const uint32_t op_type, const tm_uoffset_t offset1, const tm_uoffset_t offset2) { @@ -81,27 +83,17 @@ static tm_uoffset_t SaveTmConvOp(void* const start_ptr, tm_uoffset_t* cur_pos, O tm_param.kernel_w = p->kernel_w; tm_param.stride_h = p->stride_h; tm_param.stride_w = p->stride_w; - tm_param.pad_h = p->pad_h; - tm_param.pad_w = p->pad_w; tm_param.dilation_h = p->dilation_h; tm_param.dilation_w = p->dilation_w; tm_param.output_channel = p->output_channel; tm_param.activation = p->activation; tm_param.group = p->group; - if(p->pads.size() == 4) - { - tm_param.pads[0] = p->pads[0]; - tm_param.pads[1] = p->pads[1]; - tm_param.pads[2] = p->pads[2]; - tm_param.pads[3] = p->pads[3]; - } - else - { - tm_param.pads[0] = 0; - tm_param.pads[1] = 0; - tm_param.pads[2] = 0; - tm_param.pads[3] = 0; - } + tm_param.pad_h = p->pad_h0; + tm_param.pad_w = p->pad_w0; + tm_param.pads[0] = p->pad_h0; + tm_param.pads[1] = p->pad_w0; + tm_param.pads[2] = p->pad_h1; + tm_param.pads[3] = p->pad_w1; TM_Operator tm_op; 
SetTmOperator(&tm_op, TM_OPTYPE_CONVOLUTION, NOT_SET, @@ -113,11 +105,12 @@ static tm_uoffset_t SaveTmDeconvOp(void* const start_ptr, tm_uoffset_t* cur_pos, { DeconvParam* p = (dynamic_cast(op))->GetParam(); TM_DeconvParam tm_param; - tm_param.kernel_size = p->kernel_size; - tm_param.stride = p->stride; - tm_param.pad = p->pad; + + tm_param.kernel_size = p->kernel_h; + tm_param.stride = p->stride_h; + tm_param.pad = p->pad_w0; tm_param.num_output = p->num_output; - tm_param.dilation = p->dilation; + tm_param.dilation = p->dilation_h; TM_Operator tm_op; SetTmOperator(&tm_op, TM_OPTYPE_DECONVOLUTION, NOT_SET, @@ -244,20 +237,20 @@ static tm_uoffset_t SaveTmPoolOp(void* const start_ptr, tm_uoffset_t* cur_pos, O tm_param.alg = p->alg; tm_param.kernel_h = p->kernel_h; tm_param.kernel_w = p->kernel_w; - tm_param.pad_h = p->pad_h; - tm_param.pad_w = p->pad_w; + tm_param.pad_h = p->pad_h0; + tm_param.pad_w = p->pad_w0; tm_param.stride_h = p->stride_h; tm_param.stride_w = p->stride_w; tm_param.global = p->global; tm_param.caffe_flavor = p->caffe_flavor; - tm_param.kernel_shape[0] = p->kernel_shape[0]; - tm_param.kernel_shape[1] = p->kernel_shape[1]; - tm_param.strides[0] = p->strides[0]; - tm_param.strides[1] = p->strides[1]; - tm_param.pads[0] = p->pads[0]; - tm_param.pads[1] = p->pads[1]; - tm_param.pads[2] = p->pads[2]; - tm_param.pads[3] = p->pads[3]; + tm_param.kernel_shape[0] = p->kernel_h; + tm_param.kernel_shape[1] = p->kernel_w; + tm_param.strides[0] = p->stride_h; + tm_param.strides[1] = p->stride_w; + tm_param.pads[0] = p->pad_h0; + tm_param.pads[1] = p->pad_w0; + tm_param.pads[2] = p->pad_h1; + tm_param.pads[3] = p->pad_w1; TM_Operator tm_op; SetTmOperator(&tm_op, TM_OPTYPE_POOLING, NOT_SET, @@ -600,4 +593,6 @@ tm_uoffset_t SaveTmOperator(void* const start_ptr, tm_uoffset_t* cur_pos, Operat return 0; } +} // namespace TMSerializer1 + } // namespace TEngine diff --git a/serializer/tengine/v1/tm1_serializer.cpp b/serializer/tengine/v1/tm1_serializer.cpp new file 
mode 100644 index 000000000..0938fd1b3 --- /dev/null +++ b/serializer/tengine/v1/tm1_serializer.cpp @@ -0,0 +1,595 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: jingyou@openailab.com + */ +#include +#include +#include +#include +#include + +#include "tengine_c_api.h" +#include "exec_attr.hpp" +#include "data_type.hpp" +#include "operator_manager.hpp" +#include "static_graph.hpp" +#include "graph.hpp" +#include "node.hpp" +#include "tensor.hpp" +#include "compiler.hpp" + +#include "tm1_format.h" +#include "tm1_serializer.hpp" +#include "tm1_op_serializer.hpp" + +namespace TEngine { + +namespace TMSerializer1 { + +bool TmSerializer1::IsSaveString(void) +{ + const char* env = std::getenv("TM_WITH_STRING"); + + if(env) + return true; + else + return false; +} + +bool TmSerializer1::IsSaveData(void) +{ + const char* env = std::getenv("TM_FOR_BENCHMARK"); + + if(env) + return false; + else + return true; +} + +tm_uoffset_t TmSerializer1::SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, Tensor* tensor, + unsigned int tensor_id, unsigned int buffer_id) +{ + TM_Tensor tm_tensor; + tm_tensor.tensor_id = tensor_id; + tm_tensor.buffer_id = buffer_id; + 
tm_tensor.type = tensor->GetType(); + + bool tm_with_string = IsSaveString(); + + if(tm_with_string) + { + std::string name = tensor->GetName(); + TM_String tensor_name; + tensor_name.size = name.size(); + tensor_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), tensor_name.size); + tm_tensor.offset_s_tname = WriteTmObject(start_ptr, cur_pos, &tensor_name, sizeof(TM_String)); + } + else + tm_tensor.offset_s_tname = NOT_SET; + + const std::string& data_type = DataType::GetTypeName(tensor->GetDataType()); + if(data_type == "float32") + tm_tensor.data_type = TM_DT_FLOAT32; + else if(data_type == "float16") + tm_tensor.data_type = TM_DT_FLOAT16; + else if(data_type == "int") + tm_tensor.data_type = TM_DT_INT32; + else if(data_type == "int8") + tm_tensor.data_type = TM_DT_INT8; + + /* Get the dims of the tensor */ + TShape& shape = tensor->GetShape(); + std::vector& dim = shape.GetDim(); + if(dim.size()) + { + /* Write the vector of dims */ + size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * dim.size(); + TM_Vector_dims* v_dims = ( TM_Vector_dims* )malloc(vector_size); + v_dims->v_num = dim.size(); + for(unsigned int i = 0; i < dim.size(); i++) + { + v_dims->dims[i] = dim[i]; + } + tm_tensor.offset_vd_dims = WriteTmObject(start_ptr, cur_pos, v_dims, vector_size); + free(v_dims); + } + else + tm_tensor.offset_vd_dims = NOT_SET; + + /* Write the tensor */ + return WriteTmObject(start_ptr, cur_pos, &tm_tensor, sizeof(TM_Tensor)); +} + +tm_uoffset_t TmSerializer1::SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, Node* node, + name_map_t& tensor_name_map) +{ + TM_Node tm_node; + tm_node.node_id = node->GetNodeIndex(); + tm_node.dynamic_shape = node->IsDynamicShape(); + + bool tm_with_string = IsSaveString(); + + if(tm_with_string) + { + std::string name = node->GetName(); + TM_String node_name; + node_name.size = name.size(); + node_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), node_name.size); + 
tm_node.offset_s_nname = WriteTmObject(start_ptr, cur_pos, &node_name, sizeof(TM_String)); + } + else + tm_node.offset_s_nname = NOT_SET; + + unsigned int input_num = node->GetInputNum(); + unsigned int output_num = node->GetOutputNum(); + + if(input_num) + { + /* Write the vector of input indices */ + size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * input_num; + TM_Vector_indices* v_input_indices = ( TM_Vector_indices* )malloc(vector_size); + v_input_indices->v_num = input_num; + for(unsigned int i = 0; i < input_num; i++) + { + Tensor* p_tensor = node->GetInputTensor(i); + v_input_indices->indices[i] = tensor_name_map[p_tensor->GetName()]; + } + tm_node.offset_vi_input_tensors = WriteTmObject(start_ptr, cur_pos, v_input_indices, vector_size); + free(v_input_indices); + } + else + tm_node.offset_vi_input_tensors = NOT_SET; + + if(output_num) + { + /* Write the vector of output indices */ + size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * output_num; + TM_Vector_indices* v_output_indices = ( TM_Vector_indices* )malloc(vector_size); + v_output_indices->v_num = output_num; + for(unsigned int i = 0; i < output_num; i++) + { + Tensor* p_tensor = node->GetOutputTensor(i); + v_output_indices->indices[i] = tensor_name_map[p_tensor->GetName()]; + } + tm_node.offset_vi_output_tensors = WriteTmObject(start_ptr, cur_pos, v_output_indices, vector_size); + free(v_output_indices); + } + else + tm_node.offset_vi_output_tensors = NOT_SET; + + tm_node.offset_t_operator = SaveTmOperator(start_ptr, cur_pos, node->GetOp()); + + /* Write the node */ + return WriteTmObject(start_ptr, cur_pos, &tm_node, sizeof(TM_Node)); +} + +tm_uoffset_t TmSerializer1::SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, Graph* graph) +{ + TM_Subgraph tm_subgraph; + tm_subgraph.subgraph_id = 0; /* subgraph_id starts from 0 */ + tm_subgraph.offset_s_sname = NOT_SET; + + unsigned int tensor_num = 0; + unsigned int buffer_num = 0; + std::vector tensor_ptrs; + std::vector 
buf_ptrs; + std::vector buf_sizes; + name_map_t tensor_name_map; /* map of tensor name and tensor index */ + bool tm_no_data = !IsSaveData(); + + /* Write the nodes */ + size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * graph->seq_nodes.size(); + TM_Vector_offsets* v_nodes = ( TM_Vector_offsets* )malloc(vector_size); + v_nodes->v_num = graph->seq_nodes.size(); + for(unsigned int i = 0; i < graph->seq_nodes.size(); i++) + { + Node* p_node = graph->seq_nodes[i]; + for(unsigned int k = 0; k < p_node->GetOutputNum(); k++) + { + Tensor* p_tensor = p_node->GetOutputTensor(k); + tensor_ptrs.push_back(p_tensor); + tensor_name_map[p_tensor->GetName()] = tensor_num; + tensor_num++; + } + v_nodes->offsets[i] = SaveTmNode(start_ptr, cur_pos, p_node, tensor_name_map); + } + /* Write the vector of nodes */ + tm_subgraph.offset_vo_seq_nodes = WriteTmObject(start_ptr, cur_pos, v_nodes, vector_size); + + /* Write the tensors */ + vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor_num; + TM_Vector_offsets* v_tensors = ( TM_Vector_offsets* )malloc(vector_size); + v_tensors->v_num = tensor_num; + for(unsigned int i = 0; i < tensor_num; i++) + { + Tensor* p_tensor = tensor_ptrs[i]; + if(p_tensor->GetType() == kConstTensor) + { + buf_ptrs.push_back(p_tensor->GetMemAddr()); + buf_sizes.push_back(p_tensor->GetTotalSize()); + buffer_num++; + } + + v_tensors->offsets[i] = SaveTmTensor(start_ptr, cur_pos, p_tensor, i, buffer_num - 1); + } + /* Write the vector of tensors */ + tm_subgraph.offset_vo_tensors = WriteTmObject(start_ptr, cur_pos, v_tensors, vector_size); + + /* Write the buffers */ + vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * buffer_num; + TM_Vector_offsets* v_buffers = ( TM_Vector_offsets* )malloc(vector_size); + v_buffers->v_num = buffer_num; + for(unsigned int i = 0; i < buffer_num; i++) + { + TM_Buffer tm_buf; + tm_buf.size = buf_sizes[i]; + + if(tm_no_data) + { + /* TM_FOR_BENCHMARK environment variable exists. 
Not write buf data into the tm file */ + tm_buf.offset_data = NOT_SET; + } + else + { + /* TM_FOR_BENCHMARK environment variable does not exist */ + tm_buf.offset_data = + WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast(buf_ptrs[i]), tm_buf.size); + } + v_buffers->offsets[i] = WriteTmObject(start_ptr, cur_pos, &tm_buf, sizeof(TM_Buffer)); + } + /* Write the vector of buffers */ + tm_subgraph.offset_vo_buffers = WriteTmObject(start_ptr, cur_pos, v_buffers, vector_size); + + /* Write the vector of input indices */ + vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->input_nodes.size(); + TM_Vector_indices* v_input_indices = ( TM_Vector_indices* )malloc(vector_size); + v_input_indices->v_num = graph->input_nodes.size(); + for(unsigned int i = 0; i < graph->input_nodes.size(); i++) + { + v_input_indices->indices[i] = graph->input_nodes[i]->GetNodeIndex(); + } + tm_subgraph.offset_vi_input_indices = WriteTmObject(start_ptr, cur_pos, v_input_indices, vector_size); + + /* Write the vector of output indices */ + vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->output_nodes.size(); + TM_Vector_indices* v_output_indices = ( TM_Vector_indices* )malloc(vector_size); + v_output_indices->v_num = graph->output_nodes.size(); + for(unsigned int i = 0; i < graph->output_nodes.size(); i++) + { + v_output_indices->indices[i] = graph->output_nodes[i]->GetNodeIndex(); + } + tm_subgraph.offset_vi_output_indices = WriteTmObject(start_ptr, cur_pos, v_output_indices, vector_size); + + /* Write the subgraph */ + tm_uoffset_t ret = WriteTmObject(start_ptr, cur_pos, &tm_subgraph, sizeof(TM_Subgraph)); + + /* Free the memory of vectors */ + free(v_tensors); + free(v_buffers); + free(v_nodes); + free(v_input_indices); + free(v_output_indices); + + return ret; +} + +bool TmSerializer1::SaveModelIntoMem(void* start_ptr, Graph* graph, uint32_t* tm_model_size) +{ + bool tm_with_string = IsSaveString(); + + tm_uoffset_t cur_pos = sizeof(TM_Header); + + /* Define the 
TM_Header object */ + TM_Header header; + header.ver_main = TM_FILE_VER_MAIN; + header.ver_sub = TM_FILE_VER_SUB; + header.ver_compile = TM_FILE_VER_COMPILE; + + /* Define the TM_Model object */ + TM_Model tm_model; + if(tm_with_string) + { + const std::string& fname = graph->GetName(); + TM_String model_name; + model_name.size = fname.size(); + model_name.offset_data = WriteTmFileAlign1(start_ptr, &cur_pos, fname.c_str(), model_name.size); + tm_model.offset_s_mname = WriteTmObject(start_ptr, &cur_pos, &model_name, sizeof(TM_String)); + } + else + tm_model.offset_s_mname = NOT_SET; + + /* Write the subgraphs */ + /* Only 1 subgraph is supported currently */ + size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * 1; + TM_Vector_offsets* v_subgraphs = ( TM_Vector_offsets* )malloc(vector_size); + v_subgraphs->v_num = 1; + v_subgraphs->offsets[0] = SaveTmSubgraph(start_ptr, &cur_pos, graph); + + /* Write the vector of subgraphs */ + tm_model.offset_vo_subgraphs = WriteTmObject(start_ptr, &cur_pos, v_subgraphs, vector_size); + + /* Write the model */ + header.offset_root = WriteTmObject(start_ptr, &cur_pos, &tm_model, sizeof(TM_Model)); + *tm_model_size = cur_pos; + + /* Write the header */ + cur_pos = 0; + WriteTmObject(start_ptr, &cur_pos, &header, sizeof(TM_Header)); + + free(v_subgraphs); + + return true; +} + +bool TmSerializer1::LoadNode(StaticGraph* graph, StaticNode* node, const TM_Node* tm_node, void* mmap_buf) +{ + if(tm_node->offset_vi_input_tensors != NOT_SET) + { + const TM_Vector_indices* v_input_tensors = + GetTmPtr(mmap_buf, tm_node->offset_vi_input_tensors); + + /* Set the input tensors to the node */ + for(unsigned int i = 0; i < v_input_tensors->v_num; i++) + { + StaticTensor* tensor = graph->tensor_list[v_input_tensors->indices[i]].get(); + if(!tensor) + { + LOG_ERROR() << "The input tensor not exist: " << v_input_tensors->indices[i] << "\n"; + return false; + } + AddNodeInputTensor(node, tensor); + } + } + + 
if(tm_node->offset_vi_output_tensors != NOT_SET) + { + const TM_Vector_indices* v_output_tensors = + GetTmPtr(mmap_buf, tm_node->offset_vi_output_tensors); + + /* Set the output tensors to the node */ + for(unsigned int i = 0; i < v_output_tensors->v_num; i++) + { + StaticTensor* tensor = graph->tensor_list[v_output_tensors->indices[i]].get(); + if(!tensor) + { + LOG_ERROR() << "The output tensor not exist: " << v_output_tensors->indices[i] << "\n"; + return false; + } + AddNodeOutputTensor(node, tensor); + } + } + return true; +} + +bool TmSerializer1::LoadTensor(StaticGraph* graph, const TM_Tensor* tm_tensor, const TM_Buffer* tm_buf, void* mmap_buf) +{ + /* Set the tensor name */ + int idx = tm_tensor->tensor_id; + std::string tm_tensor_name; + if(tm_tensor->offset_s_tname == NOT_SET) + tm_tensor_name = "tensor_" + std::to_string(idx); + else + { + const TM_String* tm_string = GetTmPtr(mmap_buf, tm_tensor->offset_s_tname); + tm_tensor_name.assign(GetTmPtr(mmap_buf, tm_string->offset_data), tm_string->size); + } + + /* Create the static tensor */ + StaticTensor* tensor; + if(tm_tensor->type == kConstTensor) + tensor = CreateStaticConstTensor(graph, tm_tensor_name); + else + tensor = CreateStaticTensor(graph, tm_tensor_name); + if(!tensor) + { + LOG_ERROR() << "Create static const tensor failed: " << tm_tensor_name << "\n"; + return false; + } + + /* Set the dims */ + if(tm_tensor->offset_vd_dims != NOT_SET) + { + const TM_Vector_dims* v_dims = GetTmPtr(mmap_buf, tm_tensor->offset_vd_dims); + if(!v_dims || !(v_dims->v_num)) + { + LOG_ERROR() << "Get tensor dims failed\n"; + return false; + } + std::vector dims; + for(unsigned int i = 0; i < v_dims->v_num; i++) + dims.push_back(v_dims->dims[i]); + SetTensorDim(tensor, dims); + + } + + /* Set the data type */ + if(tm_tensor->data_type == TM_DT_FLOAT32) + SetTensorDataType(tensor, DataType::GetTypeID("float32")); + else if(tm_tensor->data_type == TM_DT_FLOAT16) + SetTensorDataType(tensor, 
DataType::GetTypeID("float16")); + else if(tm_tensor->data_type == TM_DT_INT32) + SetTensorDataType(tensor, DataType::GetTypeID("int")); + else if(tm_tensor->data_type == TM_DT_INT8) + SetTensorDataType(tensor, DataType::GetTypeID("int8")); + + /* Set the memory size and pointer */ + if(tm_tensor->type == kConstTensor) + { + SetTensorSize(tensor, tm_buf->size); + void* buf = malloc(tm_buf->size); + if(tm_buf->offset_data != NOT_SET) + { + memcpy(buf, GetTmPtr(mmap_buf, tm_buf->offset_data), tm_buf->size); + } + + SetConstTensorBuffer(tensor, buf); + SetConstTensorFileLocation(tensor, -1, 0); + } + + return true; +} + +bool TmSerializer1::LoadGraph(StaticGraph* graph, const TM_Model* tm_model, void* mmap_buf) +{ + const TM_Vector_offsets* v_graphs = GetTmPtr(mmap_buf, tm_model->offset_vo_subgraphs); + const TM_Subgraph* tm_graph = GetTmPtr(mmap_buf, v_graphs->offsets[0]); + + const TM_Vector_offsets* v_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vo_seq_nodes); + const TM_Vector_offsets* v_tensors = GetTmPtr(mmap_buf, tm_graph->offset_vo_tensors); + const TM_Vector_offsets* v_buffers = GetTmPtr(mmap_buf, tm_graph->offset_vo_buffers); + + /* Load const tensors */ + for(unsigned int i = 0; i < v_tensors->v_num; i++) + { + const TM_Tensor* tm_tensor = GetTmPtr(mmap_buf, v_tensors->offsets[i]); + const TM_Buffer* tm_buf; + if(tm_tensor->type == kConstTensor) + tm_buf = GetTmPtr(mmap_buf, v_buffers->offsets[tm_tensor->buffer_id]); + else + tm_buf = nullptr; + LoadTensor(graph, tm_tensor, tm_buf, mmap_buf); + } + + /* Create static nodes */ + unsigned int i; + for(i = 0; i < v_nodes->v_num; i++) + { + const TM_Node* tm_node = GetTmPtr(mmap_buf, v_nodes->offsets[i]); + int idx = tm_node->node_id; + std::string tm_node_name; + if(tm_node->offset_s_nname == NOT_SET) + tm_node_name = "node_" + std::to_string(idx); + else + { + const TM_String* tm_string = GetTmPtr(mmap_buf, tm_node->offset_s_nname); + tm_node_name.assign(GetTmPtr(mmap_buf, tm_string->offset_data), 
tm_string->size); + } + + const TM_Operator* tm_operator = GetTmPtr(mmap_buf, tm_node->offset_t_operator); + const std::string& tm_op_name = GetOpStr(tm_operator->operator_type); + + if(!FindOpLoadMethod(tm_op_name)) + { + LOG_ERROR() << "cannot find load function for operator: " << tm_op_name << "\n"; + break; + } + + StaticNode* node = CreateStaticNode(graph, tm_node_name); + if(!LoadNode(graph, node, tm_node, mmap_buf)) + break; + + op_load_t op_func = any_cast(GetOpLoadMethod(tm_op_name)); + + if(!op_func(graph, node, mmap_buf, tm_operator)) + break; + + /* Set the dynamic shape of the operator */ + node->op->dynamic_shape = tm_node->dynamic_shape; + } + + if(i < v_nodes->v_num) + return false; + + const TM_Vector_indices* v_input_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vi_input_indices); + const TM_Vector_indices* v_output_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vi_output_indices); + + /* Set the input nodes */ + for(unsigned int i = 0; i < v_input_nodes->v_num; i++) + { + StaticNode* node = graph->node_list[v_input_nodes->indices[i]].get(); + if(!node) + { + LOG_ERROR() << "Input node #" << v_input_nodes->indices[i] << " not exist\n"; + return false; + } + AddGraphInputNode(graph, node); + } + + /* Set the output nodes */ + for(unsigned int i = 0; i < v_output_nodes->v_num; i++) + { + StaticNode* node = graph->node_list[v_output_nodes->indices[i]].get(); + if(!node) + { + LOG_ERROR() << "Output node #" << v_output_nodes->indices[i] << " not exist\n"; + return false; + } + AddGraphOutputNode(graph, node); + } + + return true; +} + +bool TmSerializer1::LoadModelFromMem(void* mmap_buf, StaticGraph* graph) +{ + const TM_Header* tm_header = reinterpret_cast(mmap_buf); + /* Check the version of tm file format */ + if(tm_header->ver_main != TM_FILE_VER_MAIN || tm_header->ver_sub != TM_FILE_VER_SUB || + tm_header->ver_compile != TM_FILE_VER_COMPILE) + { + printf("Wrong version of tm file\n"); + return false; + } + + const TM_Model* tm_model = 
GetTmPtr(mmap_buf, tm_header->offset_root); + if(tm_model->offset_s_mname == NOT_SET) + { + SetGraphIdentity(graph, "tengine", "tengine_model", "0"); + } + else + { + std::string tm_model_name; + const TM_String* tm_string = GetTmPtr(mmap_buf, tm_model->offset_s_mname); + tm_model_name.assign(GetTmPtr(mmap_buf, tm_string->offset_data), tm_string->size); + SetGraphIdentity(graph, "tengine", tm_model_name, "0"); + } + + SetModelFormat(graph,MODEL_FORMAT_TENGINE); + SetGraphLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelLayout(graph,TENGINE_LAYOUT_NCHW); + + if(LoadGraph(graph, tm_model, mmap_buf)) + return true; + else + return false; +} + +bool TmSerializerRegisterOpLoader1(void) +{ + TmSerializerPtr serializer; + + if(!TmSerializerManager::SafeGet("tm_v1", serializer)) + return false; + + TmSerializer1* p_tengine = dynamic_cast(serializer.get()); + + for(int i = 0; i < TM_OPTYPE_NUM; i++) + { + p_tengine->RegisterOpLoadMethod(GetOpStr(i), op_load_t(LoadTmOpFunc(i))); + } + + return true; +} + +} // namespace TMSerializer1 + +} // namespace TEngine diff --git a/serializer/tengine/v2/Makefile b/serializer/tengine/v2/Makefile new file mode 100644 index 000000000..dc9eea616 --- /dev/null +++ b/serializer/tengine/v2/Makefile @@ -0,0 +1,7 @@ +obj-y+=init.o +obj-y+=tm2_op_load.o +obj-y+=tm2_op_save.o +obj-y+=tm2_serializer.o + +COMMON_CFLAGS+=-I$(shell pwd)/../../include/tengine/v2 + diff --git a/serializer/tengine/v2/init.cpp b/serializer/tengine/v2/init.cpp new file mode 100644 index 000000000..3c9825059 --- /dev/null +++ b/serializer/tengine/v2/init.cpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#include "tm_serializer.hpp" +#include "tm2_serializer.hpp" + +namespace TEngine { +namespace TMSerializer2 { + +extern bool TmSerializerRegisterOpLoader2(); + +} + +using namespace TMSerializer2; + +bool register_tm2_serializer(void) +{ + auto factory = TmSerializerFactory::GetFactory(); + + factory->RegisterInterface("tm_v2"); + auto tm_serializer = factory->Create("tm_v2"); + + TmSerializerManager::SafeAdd("tm_v2", TmSerializerPtr(tm_serializer)); + + return TmSerializerRegisterOpLoader2(); +} + +} // namespace TEngine + diff --git a/serializer/tengine/v2/tm2_op_load.cpp b/serializer/tengine/v2/tm2_op_load.cpp new file mode 100644 index 000000000..ad119d509 --- /dev/null +++ b/serializer/tengine/v2/tm2_op_load.cpp @@ -0,0 +1,895 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#include + +#include "tm2_format.h" +#include "tm2_op_serializer.hpp" + +namespace TEngine { + +namespace TMSerializer2 { + +bool LoadTmAccuracyOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_ACCURACY); + SetNodeOp(node, op); + return true; +} + +bool LoadTmBatchNormOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_BATCHNORMALIZATION; + + BatchNormParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_BatchNormParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.rescale_factor = tm_param->rescale_factor; + param.eps = tm_param->eps; + param.caffe_flavor = tm_param->caffe_flavor; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmResizeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_BILINEARRESIZE; + + ResizeParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ResizeParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.scale_w = tm_param->scale_x; + param.scale_h = tm_param->scale_y; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmConcatOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_CONCAT; + + ConcatParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ConcatParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.axis = tm_param->axis; + + StaticOp* op = 
CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmConstOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_CONST); + SetNodeOp(node, op); + return true; +} + +bool LoadTmConvOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_CONVOLUTION; + + ConvParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ConvParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.kernel_h = tm_param->kernel_h; + param.kernel_w = tm_param->kernel_w; + param.stride_h = tm_param->stride_h; + param.stride_w = tm_param->stride_w; + param.dilation_h = tm_param->dilation_h; + param.dilation_w = tm_param->dilation_w; + param.input_channel = tm_param->input_channel; + param.output_channel = tm_param->output_channel; + param.group = tm_param->group; + param.activation = tm_param->activation; + param.pad_h0 = tm_param->pad_h0; + param.pad_h1 = tm_param->pad_h1; + param.pad_w0 = tm_param->pad_w0; + param.pad_w1 = tm_param->pad_w1; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmDeconvOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_DECONVOLUTION; + + DeconvParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_DeconvParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.kernel_h = tm_param->kernel_h; + param.kernel_w = tm_param->kernel_w; + param.stride_h = tm_param->stride_h; + param.stride_w = tm_param->stride_w; + param.pad_w0 = tm_param->pad_w0; + param.pad_w1 = tm_param->pad_w1; + param.pad_h0 = tm_param->pad_h0; + param.pad_h1 = tm_param->pad_h1; + param.num_output = tm_param->num_output; + param.dilation_h = 
tm_param->dilation_h; + param.dilation_w = tm_param->dilation_w; + param.group = tm_param->group; + param.activation = tm_param->activation; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmDetectionOutputOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_DETECTIONOUTPUT; + + DetectionOutputParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_DetectionOutputParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.num_classes = tm_param->num_classes; + param.keep_top_k = tm_param->keep_top_k; + param.nms_top_k = tm_param->nms_top_k; + param.confidence_threshold = tm_param->confidence_threshold; + param.nms_threshold = tm_param->nms_threshold; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmDropoutOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_DROPOUT); + SetNodeOp(node, op); + return true; +} + +bool LoadTmEltwiseOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_ELTWISE; + + EltwiseParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_EltwiseParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.type = static_cast(tm_param->type); + param.caffe_flavor = tm_param->caffe_flavor; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmFlattenOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_FLATTEN; + + FlattenParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_FlattenParam* tm_param = 
GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.axis = tm_param->axis; + param.end_axis = tm_param->end_axis; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmFCOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_FULLYCONNECTED; + + FCParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_FCParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.num_output = tm_param->num_output; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmInputOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_INPUTOP); + SetNodeOp(node, op); + return true; +} + +bool LoadTmLRNOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_LRN; + + LRNParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_LRNParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.local_size = tm_param->local_size; + param.alpha = tm_param->alpha; + param.beta = tm_param->beta; + param.norm_region = tm_param->norm_region; + param.k = tm_param->k; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmNormalizeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_NORMALIZE; + + NormalizeParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_NormalizeParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.across_spatial = tm_param->across_spatial; + param.channel_shared = tm_param->channel_shared; + + StaticOp* op = CreateStaticOp(graph, op_str); 
+ SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmPermuteOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_PERMUTE; + + PermuteParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_PermuteParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.flag = tm_param->flag; + param.order0 = tm_param->order0; + param.order1 = tm_param->order1; + param.order2 = tm_param->order2; + param.order3 = tm_param->order3; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmPoolingOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_POOLING; + + PoolParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_PoolParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.alg = static_cast(tm_param->alg); + param.kernel_h = tm_param->kernel_h; + param.kernel_w = tm_param->kernel_w; + param.stride_h = tm_param->stride_h; + param.stride_w = tm_param->stride_w; + param.global = tm_param->global; + param.caffe_flavor = tm_param->caffe_flavor; + param.pad_h0 = tm_param->pad_h0; + param.pad_w0 = tm_param->pad_w0; + param.pad_h1 = tm_param->pad_h1; + param.pad_w1 = tm_param->pad_w1; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmPreluOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_PRELU); + SetNodeOp(node, op); + return true; +} + +bool LoadTmPriorBoxOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_PRIORBOX; + + PriorBoxParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const 
TM2_PriorBoxParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + const TM2_Vector_floats* v_minsizes = GetTmPtr(start_ptr, tm_param->offset_vf_min_size); + const TM2_Vector_floats* v_maxsizes = GetTmPtr(start_ptr, tm_param->offset_vf_max_size); + const TM2_Vector_floats* v_variances = GetTmPtr(start_ptr, tm_param->offset_vf_variance); + const TM2_Vector_floats* v_ratios = GetTmPtr(start_ptr, tm_param->offset_vf_aspect_ratio); + + for(unsigned int i = 0; i < v_minsizes->v_num; i++) + param.min_size.push_back(v_minsizes->data[i]); + for(unsigned int i = 0; i < v_maxsizes->v_num; i++) + param.max_size.push_back(v_maxsizes->data[i]); + for(unsigned int i = 0; i < v_variances->v_num; i++) + param.variance.push_back(v_variances->data[i]); + for(unsigned int i = 0; i < v_ratios->v_num; i++) + param.aspect_ratio.push_back(v_ratios->data[i]); + param.flip = tm_param->flip; + param.clip = tm_param->clip; + param.img_size = tm_param->img_size; + param.img_h = tm_param->img_h; + param.img_w = tm_param->img_w; + param.step_w = tm_param->step_w; + param.step_h = tm_param->step_h; + param.offset = tm_param->offset; + param.num_priors_ = tm_param->num_priors; + param.out_dim_ = tm_param->out_dim; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmRegionOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_REGION; + + RegionParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_RegionParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + const TM2_Vector_floats* v_biases = GetTmPtr(start_ptr, tm_param->offset_vf_biases); + + for(unsigned int i = 0; i < v_biases->v_num; i++) + param.biases.push_back(v_biases->data[i]); + param.num_classes = tm_param->num_classes; + param.side = tm_param->side; + param.num_box = tm_param->num_box; + param.coords = tm_param->coords; + 
param.confidence_threshold = tm_param->confidence_threshold; + param.nms_threshold = tm_param->nms_threshold; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmReLuOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_RELU; + + ReLuParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ReLuParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.negative_slope = tm_param->negative_slope; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmRelu6Op(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_RELU6); + SetNodeOp(node, op); + return true; +} + +bool LoadTmReorgOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_REORG; + + ReorgParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ReorgParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.stride = tm_param->stride; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmReshapeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_RESHAPE; + + ReshapeParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ReshapeParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.dim_0 = tm_param->dim_0; + param.dim_1 = tm_param->dim_1; + param.dim_2 = tm_param->dim_2; + param.dim_3 = tm_param->dim_3; + param.dim_size = tm_param->dim_size; + param.axis = tm_param->axis; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + 
SetNodeOp(node, op); + return true; +} + +bool LoadTmROIPoolingOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_ROIPOOLING; + + ROIPoolingParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ROIPoolingParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.pooled_h = tm_param->pooled_h; + param.pooled_w = tm_param->pooled_w; + param.spatial_scale = tm_param->spatial_scale; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmRPNOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_RPN; + + RPNParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_RPNParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + const TM2_Vector_floats* v_ratios = GetTmPtr(start_ptr, tm_param->offset_vf_ratios); + const TM2_Vector_floats* v_scales = GetTmPtr(start_ptr, tm_param->offset_vf_anchor_scales); + + for(unsigned int i = 0; i < v_ratios->v_num; i++) + param.ratios.push_back(v_ratios->data[i]); + for(unsigned int i = 0; i < v_scales->v_num; i++) + param.anchor_scales.push_back(v_scales->data[i]); + param.feat_stride = tm_param->feat_stride; + param.basesize = tm_param->basesize; + param.min_size = tm_param->min_size; + param.per_nms_topn = tm_param->per_nms_topn; + param.post_nms_topn = tm_param->post_nms_topn; + param.nms_thresh = tm_param->nms_thresh; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmScaleOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_SCALE; + + ScaleParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ScaleParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + 
param.axis = tm_param->axis; + param.num_axes = tm_param->num_axes; + param.bias_term = tm_param->bias_term; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmSliceOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_SLICE; + + SliceParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_SliceParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + if(tm_param->offset_vi_slice_points != TM2_NOT_SET) + { + const TM2_Vector_dims* v_slice_points = GetTmPtr(start_ptr, tm_param->offset_vi_slice_points); + for(unsigned int i = 0; i < v_slice_points->v_num; i++) + param.slice_point_.push_back(v_slice_points->dims[i]); + } + if(tm_param->offset_vi_begins != TM2_NOT_SET) + { + const TM2_Vector_dims* v_begins = GetTmPtr(start_ptr, tm_param->offset_vi_begins); + for(unsigned int i = 0; i < v_begins->v_num; i++) + param.begin_.push_back(v_begins->dims[i]); + } + if(tm_param->offset_vi_sizes != TM2_NOT_SET) + { + const TM2_Vector_dims* v_sizes = GetTmPtr(start_ptr, tm_param->offset_vi_sizes); + for(unsigned int i = 0; i < v_sizes->v_num; i++) + param.size_.push_back(v_sizes->dims[i]); + } + + param.axis = tm_param->axis; + param.iscaffe = tm_param->iscaffe; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmSoftmaxOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_SOFTMAX; + + SoftmaxParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_SoftmaxParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.axis = tm_param->axis; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmSplitOp(StaticGraph* graph, StaticNode* 
node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_SPLIT); + SetNodeOp(node, op); + return true; +} + +bool LoadTmDetectionPostProcessOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_DETECTIONPOSTPROCESS; + + DetectionPostProcessParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_DetectionPostProcessParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.max_detections = tm_param->max_detections; + param.max_classes_per_detection = tm_param->max_classes_per_detection; + param.nms_score_threshold = tm_param->nms_score_threshold; + param.nms_iou_threshold = tm_param->nms_iou_threshold; + param.num_classes = tm_param->num_classes; + + const TM2_Vector_floats* v_scales = GetTmPtr(start_ptr, tm_param->offset_vf_scales); + + for(unsigned int i = 0; i < v_scales->v_num; i++) + param.scales.push_back(v_scales->data[i]); + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmGemmOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_GEMM; + + GemmParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_GemmParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.alpha = tm_param->alpha; + param.beta = tm_param->beta; + param.transA = tm_param->transA; + param.transB = tm_param->transB; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmGenericOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_GENERIC; + + GenericParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_GenericParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + 
param.max_input_num = tm_param->max_input_num; + param.max_output_num = tm_param->max_output_num; + + const TM2_String* tm_string = GetTmPtr(start_ptr, tm_param->offset_s_opname); + char *op_name = (char *)malloc(tm_string->size); + memcpy(op_name, GetTmPtr(start_ptr, tm_string->offset_data), tm_string->size); + param.op_name = op_name; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + + return true; +} + +bool LoadTmLogisticOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_LOGISTIC); + SetNodeOp(node, op); + return true; +} + +bool LoadTmLstmOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_LSTM; + + LSTMParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_LstmParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.forget_bias = tm_param->forget_bias; + param.clip = tm_param->clip; + param.output_len = tm_param->output_len; + param.sequence_len = tm_param->sequence_len; + param.input_size = tm_param->input_size; + param.hidden_size = tm_param->hidden_size; + param.cell_size = tm_param->cell_size; + param.has_peephole = tm_param->has_peephole; + param.has_projection = tm_param->has_projection; + param.has_clip = tm_param->has_clip; + param.has_bias = tm_param->has_bias; + param.has_init_state = tm_param->has_init_state; + param.forget_act = tm_param->forget_act; + param.input_act = tm_param->input_act; + param.output_act = tm_param->output_act; + param.cellin_act = tm_param->cellin_act; + param.cellout_act = tm_param->cellout_act; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmRnnOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_RNN; 
+ + RNNParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_RnnParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.clip = tm_param->clip; + param.output_len = tm_param->output_len; + param.sequence_len = tm_param->sequence_len; + param.input_size = tm_param->input_size; + param.hidden_size = tm_param->hidden_size; + param.has_clip = tm_param->has_clip; + param.has_bias = tm_param->has_bias; + param.has_init_state = tm_param->has_init_state; + param.activation = tm_param->activation; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmTanhOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_TANH); + SetNodeOp(node, op); + return true; +} + +bool LoadTmSigmoidOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_SIGMOID); + SetNodeOp(node, op); + return true; +} + +bool LoadTmSqueezeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_SQUEEZE; + + SqueezeParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_SqueezeParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.dim_0 = tm_param->dim_0; + param.dim_1 = tm_param->dim_1; + param.dim_2 = tm_param->dim_2; + param.dim_3 = tm_param->dim_3; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmFusedbnscalereluOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_FUSEDBNSCALERELU); + SetNodeOp(node, op); + return true; +} + +op_load_t LoadTmOpFunc(uint32_t op_type) +{ + switch(op_type) + { + case TM2_OPTYPE_ACCURACY: + return LoadTmAccuracyOp; + 
case TM2_OPTYPE_BATCHNORMALIZATION: + return LoadTmBatchNormOp; + case TM2_OPTYPE_BILINEARRESIZE: + return LoadTmResizeOp; + case TM2_OPTYPE_CONCAT: + return LoadTmConcatOp; + case TM2_OPTYPE_CONST: + return LoadTmConstOp; + case TM2_OPTYPE_CONVOLUTION: + return LoadTmConvOp; + case TM2_OPTYPE_DECONVOLUTION: + return LoadTmDeconvOp; + case TM2_OPTYPE_DETECTIONOUTPUT: + return LoadTmDetectionOutputOp; + case TM2_OPTYPE_DROPOUT: + return LoadTmDropoutOp; + case TM2_OPTYPE_ELTWISE: + return LoadTmEltwiseOp; + case TM2_OPTYPE_FLATTEN: + return LoadTmFlattenOp; + case TM2_OPTYPE_FULLYCONNECTED: + return LoadTmFCOp; + case TM2_OPTYPE_INPUTOP: + return LoadTmInputOp; + case TM2_OPTYPE_LRN: + return LoadTmLRNOp; + case TM2_OPTYPE_NORMALIZE: + return LoadTmNormalizeOp; + case TM2_OPTYPE_PERMUTE: + return LoadTmPermuteOp; + case TM2_OPTYPE_POOLING: + return LoadTmPoolingOp; + case TM2_OPTYPE_PRELU: + return LoadTmPreluOp; + case TM2_OPTYPE_PRIORBOX: + return LoadTmPriorBoxOp; + case TM2_OPTYPE_REGION: + return LoadTmRegionOp; + case TM2_OPTYPE_RELU: + return LoadTmReLuOp; + case TM2_OPTYPE_RELU6: + return LoadTmRelu6Op; + case TM2_OPTYPE_REORG: + return LoadTmReorgOp; + case TM2_OPTYPE_RESHAPE: + return LoadTmReshapeOp; + case TM2_OPTYPE_ROIPOOLING: + return LoadTmROIPoolingOp; + case TM2_OPTYPE_RPN: + return LoadTmRPNOp; + case TM2_OPTYPE_SCALE: + return LoadTmScaleOp; + case TM2_OPTYPE_SLICE: + return LoadTmSliceOp; + case TM2_OPTYPE_SOFTMAX: + return LoadTmSoftmaxOp; + case TM2_OPTYPE_SPLIT: + return LoadTmSplitOp; + case TM2_OPTYPE_DETECTIONPOSTPROCESS: + return LoadTmDetectionPostProcessOp; + case TM2_OPTYPE_GEMM: + return LoadTmGemmOp; + case TM2_OPTYPE_GENERIC: + return LoadTmGenericOp; + case TM2_OPTYPE_LOGISTIC: + return LoadTmLogisticOp; + case TM2_OPTYPE_LSTM: + return LoadTmLstmOp; + case TM2_OPTYPE_RNN: + return LoadTmRnnOp; + case TM2_OPTYPE_TANH: + return LoadTmTanhOp; + case TM2_OPTYPE_SIGMOID: + return LoadTmSigmoidOp; + case TM2_OPTYPE_SQUEEZE: + return 
LoadTmSqueezeOp; + case TM2_OPTYPE_FUSEDBNSCALERELU: + return LoadTmFusedbnscalereluOp; + default: + LOG_ERROR() << "Operator #" << op_type << " not supported in tengine model yet\n"; + return nullptr; + } +} + +std::string GetOpStr(uint32_t op_type) +{ + switch(op_type) + { + case TM2_OPTYPE_ACCURACY: + return std::string(TM2_OPSTR_ACCURACY); + case TM2_OPTYPE_BATCHNORMALIZATION: + return std::string(TM2_OPSTR_BATCHNORMALIZATION); + case TM2_OPTYPE_BILINEARRESIZE: + return std::string(TM2_OPSTR_BILINEARRESIZE); + case TM2_OPTYPE_CONCAT: + return std::string(TM2_OPSTR_CONCAT); + case TM2_OPTYPE_CONST: + return std::string(TM2_OPSTR_CONST); + case TM2_OPTYPE_CONVOLUTION: + return std::string(TM2_OPSTR_CONVOLUTION); + case TM2_OPTYPE_DECONVOLUTION: + return std::string(TM2_OPSTR_DECONVOLUTION); + case TM2_OPTYPE_DETECTIONOUTPUT: + return std::string(TM2_OPSTR_DETECTIONOUTPUT); + case TM2_OPTYPE_DROPOUT: + return std::string(TM2_OPSTR_DROPOUT); + case TM2_OPTYPE_ELTWISE: + return std::string(TM2_OPSTR_ELTWISE); + case TM2_OPTYPE_FLATTEN: + return std::string(TM2_OPSTR_FLATTEN); + case TM2_OPTYPE_FULLYCONNECTED: + return std::string(TM2_OPSTR_FULLYCONNECTED); + case TM2_OPTYPE_INPUTOP: + return std::string(TM2_OPSTR_INPUTOP); + case TM2_OPTYPE_LRN: + return std::string(TM2_OPSTR_LRN); + case TM2_OPTYPE_NORMALIZE: + return std::string(TM2_OPSTR_NORMALIZE); + case TM2_OPTYPE_PERMUTE: + return std::string(TM2_OPSTR_PERMUTE); + case TM2_OPTYPE_POOLING: + return std::string(TM2_OPSTR_POOLING); + case TM2_OPTYPE_PRELU: + return std::string(TM2_OPSTR_PRELU); + case TM2_OPTYPE_PRIORBOX: + return std::string(TM2_OPSTR_PRIORBOX); + case TM2_OPTYPE_REGION: + return std::string(TM2_OPSTR_REGION); + case TM2_OPTYPE_RELU: + return std::string(TM2_OPSTR_RELU); + case TM2_OPTYPE_RELU6: + return std::string(TM2_OPSTR_RELU6); + case TM2_OPTYPE_REORG: + return std::string(TM2_OPSTR_REORG); + case TM2_OPTYPE_RESHAPE: + return std::string(TM2_OPSTR_RESHAPE); + case TM2_OPTYPE_ROIPOOLING: + 
return std::string(TM2_OPSTR_ROIPOOLING); + case TM2_OPTYPE_RPN: + return std::string(TM2_OPSTR_RPN); + case TM2_OPTYPE_SCALE: + return std::string(TM2_OPSTR_SCALE); + case TM2_OPTYPE_SLICE: + return std::string(TM2_OPSTR_SLICE); + case TM2_OPTYPE_SOFTMAX: + return std::string(TM2_OPSTR_SOFTMAX); + case TM2_OPTYPE_SPLIT: + return std::string(TM2_OPSTR_SPLIT); + case TM2_OPTYPE_DETECTIONPOSTPROCESS: + return std::string(TM2_OPSTR_DETECTIONPOSTPROCESS); + case TM2_OPTYPE_GEMM: + return std::string(TM2_OPSTR_GEMM); + case TM2_OPTYPE_GENERIC: + return std::string(TM2_OPSTR_GENERIC); + case TM2_OPTYPE_LOGISTIC: + return std::string(TM2_OPSTR_LOGISTIC); + case TM2_OPTYPE_LSTM: + return std::string(TM2_OPSTR_LSTM); + case TM2_OPTYPE_RNN: + return std::string(TM2_OPSTR_RNN); + case TM2_OPTYPE_TANH: + return std::string(TM2_OPSTR_TANH); + case TM2_OPTYPE_SIGMOID: + return std::string(TM2_OPSTR_SIGMOID); + case TM2_OPTYPE_SQUEEZE: + return std::string(TM2_OPSTR_SQUEEZE); + case TM2_OPTYPE_FUSEDBNSCALERELU: + return std::string(TM2_OPSTR_FUSEDBNSCALERELU); + default: + LOG_ERROR() << "Get operator string failed\n"; + return std::string(""); + } +} + +} // namespace TMSerializer2 + +} // namespace TEngine diff --git a/serializer/tengine/v2/tm2_op_save.cpp b/serializer/tengine/v2/tm2_op_save.cpp new file mode 100644 index 000000000..f2e56695a --- /dev/null +++ b/serializer/tengine/v2/tm2_op_save.cpp @@ -0,0 +1,825 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#include + +#include "tm2_format.h" +#include "tm2_op_serializer.hpp" + +namespace TEngine { + +namespace TMSerializer2 { + +inline void SetTmOperator(TM2_Operator* tm_op, const uint32_t op_type, const tm_uoffset_t offset) +{ + tm_op->op_ver = TM2_OP_VER; + tm_op->operator_type = op_type; + tm_op->offset_t_param = offset; +} + +tm_uoffset_t SaveTmAccuracyOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_ACCURACY, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmBatchNormOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + BatchNormParam* p = (dynamic_cast(op))->GetParam(); + TM2_BatchNormParam tm_param; + tm_param.rescale_factor = p->rescale_factor; + tm_param.eps = p->eps; + tm_param.caffe_flavor = p->caffe_flavor; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_BATCHNORMALIZATION, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_BatchNormParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmConcatOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ConcatParam* p = (dynamic_cast(op))->GetParam(); + TM2_ConcatParam tm_param; + tm_param.axis = p->axis; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_CONCAT, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ConcatParam))); + return WriteTmObject(start_ptr, 
cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmConstOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_CONST, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmConvOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ConvParam* p = (dynamic_cast(op))->GetParam(); + TM2_ConvParam tm_param; + + tm_param.kernel_h = p->kernel_h; + tm_param.kernel_w = p->kernel_w; + tm_param.stride_h = p->stride_h; + tm_param.stride_w = p->stride_w; + tm_param.dilation_h = p->dilation_h; + tm_param.dilation_w = p->dilation_w; + tm_param.input_channel = p->input_channel; + tm_param.output_channel = p->output_channel; + tm_param.group = p->group; + tm_param.activation = p->activation; + tm_param.pad_h0 = p->pad_h0; + tm_param.pad_h1 = p->pad_h1; + tm_param.pad_w0 = p->pad_w0; + tm_param.pad_w1 = p->pad_w1; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_CONVOLUTION, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ConvParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmDeconvOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + DeconvParam* p = (dynamic_cast(op))->GetParam(); + TM2_DeconvParam tm_param; + + tm_param.kernel_h = p->kernel_h; + tm_param.kernel_w = p->kernel_w; + tm_param.stride_h = p->stride_h; + tm_param.stride_w = p->stride_w; + tm_param.pad_w0 = p->pad_w0; + tm_param.pad_w1 = p->pad_w1; + tm_param.pad_h0 = p->pad_h0; + tm_param.pad_h1 = p->pad_h1; + tm_param.num_output = p->num_output; + tm_param.dilation_h = p->dilation_h; + tm_param.dilation_w = p->dilation_w; + tm_param.group = p->group; + tm_param.activation = p->activation; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_DECONVOLUTION, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_DeconvParam))); + return WriteTmObject(start_ptr, cur_pos, 
&tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmDetectionOutputOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + DetectionOutputParam* p = (dynamic_cast(op))->GetParam(); + TM2_DetectionOutputParam tm_param; + tm_param.num_classes = p->num_classes; + tm_param.keep_top_k = p->keep_top_k; + tm_param.nms_top_k = p->nms_top_k; + tm_param.confidence_threshold = p->confidence_threshold; + tm_param.nms_threshold = p->nms_threshold; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_DETECTIONOUTPUT, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_DetectionOutputParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmDropoutOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_DROPOUT, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmEltwiseOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + EltwiseParam* p = (dynamic_cast(op))->GetParam(); + TM2_EltwiseParam tm_param; + tm_param.type = p->type; + tm_param.caffe_flavor = p->caffe_flavor; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_ELTWISE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_EltwiseParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmFCOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + FCParam* p = (dynamic_cast(op))->GetParam(); + TM2_FCParam tm_param; + tm_param.num_output = p->num_output; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_FULLYCONNECTED, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_FCParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmFlattenOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + FlattenParam* p = (dynamic_cast(op))->GetParam(); + TM2_FlattenParam 
tm_param; + tm_param.axis = p->axis; + tm_param.end_axis = p->end_axis; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_FLATTEN, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_FlattenParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmInputOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_INPUTOP, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmLRNOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + LRNParam* p = (dynamic_cast(op))->GetParam(); + TM2_LRNParam tm_param; + tm_param.local_size = p->local_size; + tm_param.alpha = p->alpha; + tm_param.beta = p->beta; + tm_param.norm_region = p->norm_region; + tm_param.k = p->k; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_LRN, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_LRNParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmNormalizeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + NormalizeParam* p = (dynamic_cast(op))->GetParam(); + TM2_NormalizeParam tm_param; + tm_param.across_spatial = p->across_spatial; + tm_param.channel_shared = p->channel_shared; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_NORMALIZE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_NormalizeParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmPermuteOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + PermuteParam* p = (dynamic_cast(op))->GetParam(); + TM2_PermuteParam tm_param; + tm_param.flag = p->flag; + tm_param.order0 = p->order0; + tm_param.order1 = p->order1; + tm_param.order2 = p->order2; + tm_param.order3 = p->order3; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_PERMUTE, + WriteTmObject(start_ptr, 
cur_pos, &tm_param, sizeof(TM2_PermuteParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmPoolingOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + PoolParam* p = (dynamic_cast(op))->GetParam(); + TM2_PoolParam tm_param; + tm_param.alg = p->alg; + tm_param.kernel_h = p->kernel_h; + tm_param.kernel_w = p->kernel_w; + tm_param.stride_h = p->stride_h; + tm_param.stride_w = p->stride_w; + tm_param.global = p->global; + tm_param.caffe_flavor = p->caffe_flavor; + tm_param.pad_h0 = p->pad_h0; + tm_param.pad_w0 = p->pad_w0; + tm_param.pad_h1 = p->pad_h1; + tm_param.pad_w1 = p->pad_w1; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_POOLING, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_PoolParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmPreluOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_PRELU, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + PriorBoxParam* p = (dynamic_cast(op))->GetParam(); + TM2_PriorBoxParam tm_param; + + size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->min_size.size(); + TM2_Vector_floats* v_minsizes = ( TM2_Vector_floats* )malloc(vector_size); + v_minsizes->v_num = p->min_size.size(); + for(unsigned int i = 0; i < p->min_size.size(); i++) + { + v_minsizes->data[i] = p->min_size[i]; + } + tm_param.offset_vf_min_size = WriteTmObject(start_ptr, cur_pos, v_minsizes, vector_size); + free(v_minsizes); + + vector_size = sizeof(tm_size_t) + sizeof(float) * p->max_size.size(); + TM2_Vector_floats* v_maxsizes = ( TM2_Vector_floats* )malloc(vector_size); + v_maxsizes->v_num = p->max_size.size(); + for(unsigned int i = 0; i < p->max_size.size(); i++) + { + v_maxsizes->data[i] = 
p->max_size[i]; + } + tm_param.offset_vf_max_size = WriteTmObject(start_ptr, cur_pos, v_maxsizes, vector_size); + free(v_maxsizes); + + vector_size = sizeof(tm_size_t) + sizeof(float) * p->variance.size(); + TM2_Vector_floats* v_variance = ( TM2_Vector_floats* )malloc(vector_size); + v_variance->v_num = p->variance.size(); + for(unsigned int i = 0; i < p->variance.size(); i++) + { + v_variance->data[i] = p->variance[i]; + } + tm_param.offset_vf_variance = WriteTmObject(start_ptr, cur_pos, v_variance, vector_size); + free(v_variance); + + vector_size = sizeof(tm_size_t) + sizeof(float) * p->aspect_ratio.size(); + TM2_Vector_floats* v_ratios = ( TM2_Vector_floats* )malloc(vector_size); + v_ratios->v_num = p->aspect_ratio.size(); + for(unsigned int i = 0; i < p->aspect_ratio.size(); i++) + { + v_ratios->data[i] = p->aspect_ratio[i]; + } + tm_param.offset_vf_aspect_ratio = WriteTmObject(start_ptr, cur_pos, v_ratios, vector_size); + free(v_ratios); + + tm_param.flip = p->flip; + tm_param.clip = p->clip; + tm_param.img_size = p->img_size; + tm_param.img_h = p->img_h; + tm_param.img_w = p->img_w; + tm_param.step_w = p->step_w; + tm_param.step_h = p->step_h; + tm_param.offset = p->offset; + tm_param.num_priors = p->num_priors_; + tm_param.out_dim = p->out_dim_; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_PRIORBOX, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_PriorBoxParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmRegionOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + RegionParam* p = (dynamic_cast(op))->GetParam(); + TM2_RegionParam tm_param; + tm_param.num_classes = p->num_classes; + tm_param.side = p->side; + tm_param.num_box = p->num_box; + tm_param.coords = p->coords; + tm_param.confidence_threshold = p->confidence_threshold; + tm_param.nms_threshold = p->nms_threshold; + + size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->biases.size(); + 
TM2_Vector_floats* v_biases = ( TM2_Vector_floats* )malloc(vector_size); + v_biases->v_num = p->biases.size(); + for(unsigned int i = 0; i < p->biases.size(); i++) + { + v_biases->data[i] = p->biases[i]; + } + tm_param.offset_vf_biases = WriteTmObject(start_ptr, cur_pos, v_biases, vector_size); + free(v_biases); + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_REGION, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_RegionParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmReLuOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ReLuParam* p = (dynamic_cast(op))->GetParam(); + TM2_ReLuParam tm_param; + tm_param.negative_slope = p->negative_slope; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_RELU, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ReLuParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmRelu6Op(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_RELU6, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmReorgOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ReorgParam* p = (dynamic_cast(op))->GetParam(); + TM2_ReorgParam tm_param; + tm_param.stride = p->stride; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_REORG, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ReorgParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmReshapeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ReshapeParam* p = (dynamic_cast(op))->GetParam(); + TM2_ReshapeParam tm_param; + + tm_param.dim_0 = p->dim_0; + tm_param.dim_1 = p->dim_1; + tm_param.dim_2 = p->dim_2; + tm_param.dim_3 = p->dim_3; + tm_param.dim_size = p->dim_size; + tm_param.axis = p->axis; + + TM2_Operator 
tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_RESHAPE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ReshapeParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmResizeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ResizeParam* p = (dynamic_cast(op))->GetParam(); + TM2_ResizeParam tm_param; + tm_param.scale_x = p->scale_w; + tm_param.scale_y = p->scale_h; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_BILINEARRESIZE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ResizeParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmROIPoolingOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ROIPoolingParam* p = (dynamic_cast(op))->GetParam(); + TM2_ROIPoolingParam tm_param; + tm_param.pooled_h = p->pooled_h; + tm_param.pooled_w = p->pooled_w; + tm_param.spatial_scale = p->spatial_scale; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_ROIPOOLING, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ROIPoolingParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + RPNParam* p = (dynamic_cast(op))->GetParam(); + TM2_RPNParam tm_param; + + size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->ratios.size(); + TM2_Vector_floats* v_ratios = ( TM2_Vector_floats* )malloc(vector_size); + v_ratios->v_num = p->ratios.size(); + for(unsigned int i = 0; i < p->ratios.size(); i++) + { + v_ratios->data[i] = p->ratios[i]; + } + tm_param.offset_vf_ratios = WriteTmObject(start_ptr, cur_pos, v_ratios, vector_size); + free(v_ratios); + + vector_size = sizeof(tm_size_t) + sizeof(float) * p->anchor_scales.size(); + TM2_Vector_floats* v_scales = ( TM2_Vector_floats* )malloc(vector_size); + v_scales->v_num = p->anchor_scales.size(); + for(unsigned int i = 0; i < 
p->anchor_scales.size(); i++) + { + v_scales->data[i] = p->anchor_scales[i]; + } + tm_param.offset_vf_anchor_scales = WriteTmObject(start_ptr, cur_pos, v_scales, vector_size); + free(v_scales); + + vector_size = sizeof(tm_size_t) + sizeof(float) * p->anchors_.size() * 4; + TM2_Vector_anchors* v_anchors = ( TM2_Vector_anchors* )malloc(vector_size); + v_anchors->v_num = p->anchors_.size(); + for(unsigned int i = 0; i < p->anchors_.size(); i++) + { + v_anchors->data[i][0] = p->anchors_[i].x0; + v_anchors->data[i][1] = p->anchors_[i].y0; + v_anchors->data[i][2] = p->anchors_[i].x1; + v_anchors->data[i][3] = p->anchors_[i].y1; + } + tm_param.offset_va_anchors = WriteTmObject(start_ptr, cur_pos, v_anchors, vector_size); + free(v_anchors); + + tm_param.feat_stride = p->feat_stride; + tm_param.basesize = p->basesize; + tm_param.min_size = p->min_size; + tm_param.per_nms_topn = p->per_nms_topn; + tm_param.post_nms_topn = p->post_nms_topn; + tm_param.nms_thresh = p->nms_thresh; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_RPN, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_RPNParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmScaleOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ScaleParam* p = (dynamic_cast(op))->GetParam(); + TM2_ScaleParam tm_param; + tm_param.axis = p->axis; + tm_param.num_axes = p->num_axes; + tm_param.bias_term = p->bias_term; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_SCALE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ScaleParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + SliceParam* p = (dynamic_cast(op))->GetParam(); + TM2_SliceParam tm_param; + + tm_param.axis = p->axis; + tm_param.iscaffe = p->iscaffe; + + if((p->slice_point_).size()) + { + size_t vector_size = sizeof(tm_size_t) + 
sizeof(int32_t) * (p->slice_point_).size(); + TM2_Vector_dims* v_slice_points = ( TM2_Vector_dims* )malloc(vector_size); + v_slice_points->v_num = (p->slice_point_).size(); + for(unsigned int i = 0; i < (p->slice_point_).size(); i++) + { + v_slice_points->dims[i] = p->slice_point_[i]; + } + tm_param.offset_vi_slice_points = WriteTmObject(start_ptr, cur_pos, v_slice_points, vector_size); + free(v_slice_points); + } + else + tm_param.offset_vi_slice_points = TM2_NOT_SET; + + if((p->begin_).size()) + { + size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * (p->begin_).size(); + TM2_Vector_dims* v_begins = ( TM2_Vector_dims* )malloc(vector_size); + v_begins->v_num = (p->begin_).size(); + for(unsigned int i = 0; i < (p->begin_).size(); i++) + { + v_begins->dims[i] = p->begin_[i]; + } + tm_param.offset_vi_begins = WriteTmObject(start_ptr, cur_pos, v_begins, vector_size); + free(v_begins); + } + else + tm_param.offset_vi_begins = TM2_NOT_SET; + + if((p->size_).size()) + { + size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * (p->size_).size(); + TM2_Vector_dims* v_sizes = ( TM2_Vector_dims* )malloc(vector_size); + v_sizes->v_num = (p->size_).size(); + for(unsigned int i = 0; i < (p->size_).size(); i++) + { + v_sizes->dims[i] = p->size_[i]; + } + tm_param.offset_vi_sizes = WriteTmObject(start_ptr, cur_pos, v_sizes, vector_size); + free(v_sizes); + } + else + tm_param.offset_vi_sizes = TM2_NOT_SET; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_SLICE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SliceParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmSoftmaxOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + SoftmaxParam* p = (dynamic_cast(op))->GetParam(); + TM2_SoftmaxParam tm_param; + tm_param.axis = p->axis; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_SOFTMAX, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SoftmaxParam))); + return 
WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmSplitOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_SPLIT, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmDetectionPostProcessOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + DetectionPostProcessParam* p = (dynamic_cast(op))->GetParam(); + TM2_DetectionPostProcessParam tm_param; + + tm_param.max_detections = p->max_detections; + tm_param.max_classes_per_detection = p->max_classes_per_detection; + tm_param.nms_score_threshold = p->nms_score_threshold; + tm_param.nms_iou_threshold = p->nms_iou_threshold; + tm_param.num_classes = p->num_classes; + + size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->scales.size(); + TM2_Vector_floats* v_scales = ( TM2_Vector_floats* )malloc(vector_size); + v_scales->v_num = p->scales.size(); + for(unsigned int i = 0; i < p->scales.size(); i++) + { + v_scales->data[i] = p->scales[i]; + } + tm_param.offset_vf_scales = WriteTmObject(start_ptr, cur_pos, v_scales, vector_size); + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_DETECTIONPOSTPROCESS, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_DetectionPostProcessParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmGemmOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + GemmParam* p = (dynamic_cast(op))->GetParam(); + TM2_GemmParam tm_param; + + tm_param.alpha = p->alpha; + tm_param.beta = p->beta; + tm_param.transA = p->transA; + tm_param.transB = p->transB; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_GEMM, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_GemmParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmGenericOp(void* const start_ptr, tm_uoffset_t* 
cur_pos, Operator* op) +{ + GenericParam* p = (dynamic_cast(op))->GetParam(); + TM2_GenericParam tm_param; + + tm_param.max_input_num = p->max_input_num; + tm_param.max_output_num = p->max_output_num; + + TM2_String op_name; + op_name.size = strlen(p->op_name) + 1; // including trailing \0 + op_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, p->op_name, op_name.size); + tm_param.offset_s_opname = WriteTmObject(start_ptr, cur_pos, &op_name, sizeof(TM2_String)); + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_GENERIC, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_GenericParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmLogisticOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_LOGISTIC, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmLstmOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + LSTMParam* p = (dynamic_cast(op))->GetParam(); + TM2_LstmParam tm_param; + + tm_param.forget_bias = p->forget_bias; + tm_param.clip = p->clip; + tm_param.output_len = p->output_len; + tm_param.sequence_len = p->sequence_len; + tm_param.input_size = p->input_size; + tm_param.hidden_size = p->hidden_size; + tm_param.cell_size = p->cell_size; + tm_param.has_peephole = p->has_peephole; + tm_param.has_projection = p->has_projection; + tm_param.has_clip = p->has_clip; + tm_param.has_bias = p->has_bias; + tm_param.has_init_state = p->has_init_state; + tm_param.forget_act = p->forget_act; + tm_param.input_act = p->input_act; + tm_param.output_act = p->output_act; + tm_param.cellin_act = p->cellin_act; + tm_param.cellout_act = p->cellout_act; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_LSTM, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_LstmParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} 
+ +tm_uoffset_t SaveTmRnnOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + RNNParam* p = (dynamic_cast(op))->GetParam(); + TM2_RnnParam tm_param; + + tm_param.clip = p->clip; + tm_param.output_len = p->output_len; + tm_param.sequence_len = p->sequence_len; + tm_param.input_size = p->input_size; + tm_param.hidden_size = p->hidden_size; + tm_param.has_clip = p->has_clip; + tm_param.has_bias = p->has_bias; + tm_param.has_init_state = p->has_init_state; + tm_param.activation = p->activation; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_RNN, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_RnnParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmTanhOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_TANH, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmSigmoidOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_SIGMOID, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmSqueezeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + SqueezeParam* p = (dynamic_cast(op))->GetParam(); + TM2_SqueezeParam tm_param; + + tm_param.dim_0 = p->dim_0; + tm_param.dim_1 = p->dim_1; + tm_param.dim_2 = p->dim_2; + tm_param.dim_3 = p->dim_3; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_SQUEEZE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SqueezeParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmFusedbnscalereluOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_FUSEDBNSCALERELU, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, 
sizeof(TM2_Operator)); +} + +op_save_t SaveTmOpFunc(uint32_t op_type) +{ + switch(op_type) + { + case TM2_OPTYPE_ACCURACY: + return SaveTmAccuracyOp; + case TM2_OPTYPE_BATCHNORMALIZATION: + return SaveTmBatchNormOp; + case TM2_OPTYPE_BILINEARRESIZE: + return SaveTmResizeOp; + case TM2_OPTYPE_CONCAT: + return SaveTmConcatOp; + case TM2_OPTYPE_CONST: + return SaveTmConstOp; + case TM2_OPTYPE_CONVOLUTION: + return SaveTmConvOp; + case TM2_OPTYPE_DECONVOLUTION: + return SaveTmDeconvOp; + case TM2_OPTYPE_DETECTIONOUTPUT: + return SaveTmDetectionOutputOp; + case TM2_OPTYPE_DROPOUT: + return SaveTmDropoutOp; + case TM2_OPTYPE_ELTWISE: + return SaveTmEltwiseOp; + case TM2_OPTYPE_FLATTEN: + return SaveTmFlattenOp; + case TM2_OPTYPE_FULLYCONNECTED: + return SaveTmFCOp; + case TM2_OPTYPE_INPUTOP: + return SaveTmInputOp; + case TM2_OPTYPE_LRN: + return SaveTmLRNOp; + case TM2_OPTYPE_NORMALIZE: + return SaveTmNormalizeOp; + case TM2_OPTYPE_PERMUTE: + return SaveTmPermuteOp; + case TM2_OPTYPE_POOLING: + return SaveTmPoolingOp; + case TM2_OPTYPE_PRELU: + return SaveTmPreluOp; + case TM2_OPTYPE_PRIORBOX: + return SaveTmPriorBoxOp; + case TM2_OPTYPE_REGION: + return SaveTmRegionOp; + case TM2_OPTYPE_RELU: + return SaveTmReLuOp; + case TM2_OPTYPE_RELU6: + return SaveTmRelu6Op; + case TM2_OPTYPE_REORG: + return SaveTmReorgOp; + case TM2_OPTYPE_RESHAPE: + return SaveTmReshapeOp; + case TM2_OPTYPE_ROIPOOLING: + return SaveTmROIPoolingOp; + case TM2_OPTYPE_RPN: + return SaveTmRPNOp; + case TM2_OPTYPE_SCALE: + return SaveTmScaleOp; + case TM2_OPTYPE_SLICE: + return SaveTmSliceOp; + case TM2_OPTYPE_SOFTMAX: + return SaveTmSoftmaxOp; + case TM2_OPTYPE_SPLIT: + return SaveTmSplitOp; + case TM2_OPTYPE_DETECTIONPOSTPROCESS: + return SaveTmDetectionPostProcessOp; + case TM2_OPTYPE_GEMM: + return SaveTmGemmOp; + case TM2_OPTYPE_GENERIC: + return SaveTmGenericOp; + case TM2_OPTYPE_LOGISTIC: + return SaveTmLogisticOp; + case TM2_OPTYPE_LSTM: + return SaveTmLstmOp; + case TM2_OPTYPE_RNN: + return 
SaveTmRnnOp; + case TM2_OPTYPE_TANH: + return SaveTmTanhOp; + case TM2_OPTYPE_SIGMOID: + return SaveTmSigmoidOp; + case TM2_OPTYPE_SQUEEZE: + return SaveTmSqueezeOp; + case TM2_OPTYPE_FUSEDBNSCALERELU: + return SaveTmFusedbnscalereluOp; + default: + LOG_ERROR() << "Operator #" << op_type << " not supported in tengine model yet\n"; + return nullptr; + } +} + +} // namespace TMSerializer2 + +} // namespace TEngine diff --git a/serializer/tengine/v2/tm2_serializer.cpp b/serializer/tengine/v2/tm2_serializer.cpp new file mode 100644 index 000000000..33209e82a --- /dev/null +++ b/serializer/tengine/v2/tm2_serializer.cpp @@ -0,0 +1,750 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#include +#include +#include +#include +#include +#include +#include + +#include "operator_manager.hpp" +#include "static_graph.hpp" +#include "graph.hpp" +#include "node.hpp" +#include "tensor.hpp" +#include "compiler.hpp" + +#include "tm2_format.h" +#include "tm2_serializer.hpp" +#include "tm2_op_serializer.hpp" + +#define TYPE_INFO_INT32 1 +#define TYPE_INFO_UINT32 2 +#define TYPE_INFO_FLOAT 3 +#define TYPE_INFO_POINTER 4 +#define TYPE_INFO_GENERIC 5 + +namespace TEngine { + +extern int NodeSetParamGeneric(void* node, const char* param_name, const char* type_name, const void* param_val, int size); +extern int NodeAddParamGeneric(void* node, const char* param_name, const char* type_name, int param_size); + +} + +using namespace TEngine; + +namespace TEngine { + +namespace TMSerializer2 { + +static int typename_to_int(const char* name) +{ + if(name == nullptr) + return TYPE_INFO_POINTER; + + if(!strcmp(name, typeid(int).name())) + return TYPE_INFO_INT32; + if(!strcmp(name, typeid(unsigned int).name())) + return TYPE_INFO_UINT32; + if(!strcmp(name, typeid(float).name())) + return TYPE_INFO_FLOAT; + + return TYPE_INFO_GENERIC; +} + +static const char* int_to_typename(int id) +{ + switch(id) + { + case TYPE_INFO_INT32: + return typeid(int).name(); + case TYPE_INFO_UINT32: + return typeid(unsigned int).name(); + case TYPE_INFO_FLOAT: + return typeid(float).name(); + case TYPE_INFO_POINTER: + case TYPE_INFO_GENERIC: + default: + return nullptr; + } +} + +bool TmSerializer2::IsSaveString(void) +{ + const char* env = std::getenv("TM_NO_STRING"); + + if(env) + return false; + else + return true; +} + +bool TmSerializer2::IsSaveData(void) +{ + const char* env = std::getenv("TM_FOR_BENCHMARK"); + + if(env) + return false; + else + return true; +} + +tm_uoffset_t TmSerializer2::SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, Tensor* tensor, + unsigned int tensor_id, unsigned int 
buffer_id) +{ + TM2_Tensor tm_tensor; + tm_tensor.tensor_id = tensor_id; + tm_tensor.buffer_id = buffer_id; + tm_tensor.type = tensor->GetType(); + tm_tensor.data_type = tensor->GetDataType(); + tm_tensor.layout = (tensor->GetShape()).GetDataLayout(); + + bool tm_with_string = IsSaveString(); + + if(tm_with_string) + { + std::string name = tensor->GetName(); + TM2_String tensor_name; + tensor_name.size = name.size()+1; // including trailing \0 + tensor_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), tensor_name.size); + tm_tensor.offset_s_tname = WriteTmObject(start_ptr, cur_pos, &tensor_name, sizeof(TM2_String)); + } + else + tm_tensor.offset_s_tname = TM2_NOT_SET; + + /* Get the dims of the tensor */ + TShape& shape = tensor->GetShape(); + std::vector& dim = shape.GetDim(); + size_t vector_size; + if(dim.size()) + { + /* Write the vector of dims */ + vector_size = sizeof(tm_size_t) + sizeof(int32_t) * dim.size(); + TM2_Vector_dims* v_dims = ( TM2_Vector_dims* )malloc(vector_size); + v_dims->v_num = dim.size(); + for(unsigned int i = 0; i < dim.size(); i++) + { + v_dims->dims[i] = dim[i]; + } + tm_tensor.offset_vd_dims = WriteTmObject(start_ptr, cur_pos, v_dims, vector_size); + free(v_dims); + } + else + tm_tensor.offset_vd_dims = TM2_NOT_SET; + + /* Write the quant params */ + std::vector* params = tensor->GetQuantParam(); + if(params->size() != 0) + { + vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * params->size(); + TM2_Vector_offsets* v_qtparams = ( TM2_Vector_offsets* )malloc(vector_size); + v_qtparams->v_num = params->size(); + for(unsigned int i = 0; i < v_qtparams->v_num; i++) + { + QuantParam& p = (*params)[i]; + TM2_QuantParam qtparam; + + qtparam.zero_point = p.zero_point; + qtparam.scale = p.scale; + qtparam.width = p.width; + + v_qtparams->offsets[i] = WriteTmObject(start_ptr, cur_pos, &qtparam, sizeof(TM2_QuantParam)); + } + + /* Write the vector of quant params */ + tm_tensor.offect_vo_quantparams = 
WriteTmObject(start_ptr, cur_pos, v_qtparams, vector_size); + } + else + tm_tensor.offect_vo_quantparams = TM2_NOT_SET; + + /* Write the tensor */ + return WriteTmObject(start_ptr, cur_pos, &tm_tensor, sizeof(TM2_Tensor)); +} + +tm_uoffset_t TmSerializer2::SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, Node* node, + name_map_t& tensor_name_map) +{ + TM2_Node tm_node; + tm_node.node_id = node->GetNodeIndex(); + tm_node.dynamic_shape = node->IsDynamicShape(); + + bool tm_with_string = IsSaveString(); + + if(tm_with_string) + { + std::string name = node->GetName(); + TM2_String node_name; + node_name.size = name.size()+1; // including trailing \0 + node_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), node_name.size); + tm_node.offset_s_nname = WriteTmObject(start_ptr, cur_pos, &node_name, sizeof(TM2_String)); + } + else + tm_node.offset_s_nname = TM2_NOT_SET; + + unsigned int input_num = node->GetInputNum(); + unsigned int output_num = node->GetOutputNum(); + + if(input_num) + { + /* Write the vector of input indices */ + size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * input_num; + TM2_Vector_indices* v_input_indices = ( TM2_Vector_indices* )malloc(vector_size); + v_input_indices->v_num = input_num; + for(unsigned int i = 0; i < input_num; i++) + { + Tensor* p_tensor = node->GetInputTensor(i); + v_input_indices->indices[i] = tensor_name_map[p_tensor->GetName()]; + } + tm_node.offset_vi_input_tensors = WriteTmObject(start_ptr, cur_pos, v_input_indices, vector_size); + free(v_input_indices); + } + else + tm_node.offset_vi_input_tensors = TM2_NOT_SET; + + if(output_num) + { + /* Write the vector of output indices */ + size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * output_num; + TM2_Vector_indices* v_output_indices = ( TM2_Vector_indices* )malloc(vector_size); + v_output_indices->v_num = output_num; + for(unsigned int i = 0; i < output_num; i++) + { + Tensor* p_tensor = node->GetOutputTensor(i); + 
v_output_indices->indices[i] = tensor_name_map[p_tensor->GetName()]; + } + tm_node.offset_vi_output_tensors = WriteTmObject(start_ptr, cur_pos, v_output_indices, vector_size); + free(v_output_indices); + } + else + tm_node.offset_vi_output_tensors = TM2_NOT_SET; + + /* Write tm operator */ + std::string op_name = node->GetOp()->GetName(); + if(op_name == "Input") + op_name = TM2_OPSTR_INPUTOP; + if(!FindOpSaveMethod(op_name)) + { + LOG_ERROR() << "cannot find save function for operator: " << op_name << "\n"; + return false; + } + op_save_t op_save_func = any_cast(GetOpSaveMethod(op_name)); + tm_node.offset_t_operator = op_save_func(start_ptr, cur_pos, node->GetOp()); + + /* No custom attrs */ + if(!node->ExistAttr(ATTR_CUSTOM_ATTR)) + { + tm_node.offset_vo_attrs = TM2_NOT_SET; + /* Write the node */ + return WriteTmObject(start_ptr, cur_pos, &tm_node, sizeof(TM2_Node)); + } + + /* Get custom attrs of node */ + std::vector tm_attrs; + node_custom_attr_map_t* attr_map = any_cast(&node->GetAttr(ATTR_CUSTOM_ATTR)); + node_custom_attr_map_t::iterator it = (*attr_map).begin(); + while(it != (*attr_map).end()) + { + TM2_Attr tm_attr; + std::string attr_name = it->first; + CustomNodeAttr attr = it->second; + + TM2_String tm_attr_name, tm_attr_val; + tm_attr_name.size = attr_name.size()+1; // including trailing \0 + tm_attr_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, attr_name.c_str(), attr_name.size()); + tm_attr.offset_s_attrname = WriteTmObject(start_ptr, cur_pos, &tm_attr_name, sizeof(TM2_String)); + + tm_attr_val.size = attr.attr_size; // no trailing \0 + tm_attr_val.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, &(attr.mem), attr.attr_size); + tm_attr.offset_s_attrval = WriteTmObject(start_ptr, cur_pos, &tm_attr_val, sizeof(TM2_String)); + + tm_attr.attr_type = typename_to_int(attr.type_name); + + tm_attrs.push_back(tm_attr); + ++it; + } + + /* Write custom attrs */ + size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tm_attrs.size(); + 
TM2_Vector_offsets* v_attrs = ( TM2_Vector_offsets* )malloc(vector_size); + v_attrs->v_num = tm_attrs.size(); + for(unsigned int i = 0; i < tm_attrs.size(); i++) + { + v_attrs->offsets[i] = WriteTmObject(start_ptr, cur_pos, &(tm_attrs[i]), sizeof(TM2_Attr)); + } + tm_node.offset_vo_attrs = WriteTmObject(start_ptr, cur_pos, v_attrs, vector_size); + free(v_attrs); + + /* Write the node */ + return WriteTmObject(start_ptr, cur_pos, &tm_node, sizeof(TM2_Node)); +} + +tm_uoffset_t TmSerializer2::SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, Graph* graph) +{ + TM2_Subgraph tm_subgraph; + tm_subgraph.subgraph_id = 0; /* subgraph_id starts from 0 */ + tm_subgraph.offset_s_sname = TM2_NOT_SET; + + tm_subgraph.graph_layout = graph->GetLayout(); + tm_subgraph.model_layout = graph->GetModelLayout(); + + unsigned int tensor_num = 0; + unsigned int buffer_num = 0; + std::vector tensor_ptrs; + std::vector buf_ptrs; + std::vector buf_sizes; + name_map_t tensor_name_map; /* map of tensor name and tensor index */ + bool tm_no_data = !IsSaveData(); + + /* Write the nodes */ + size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * graph->seq_nodes.size(); + TM2_Vector_offsets* v_nodes = ( TM2_Vector_offsets* )malloc(vector_size); + v_nodes->v_num = graph->seq_nodes.size(); + for(unsigned int i = 0; i < graph->seq_nodes.size(); i++) + { + Node* p_node = graph->seq_nodes[i]; + for(unsigned int k = 0; k < p_node->GetOutputNum(); k++) + { + Tensor* p_tensor = p_node->GetOutputTensor(k); + tensor_ptrs.push_back(p_tensor); + tensor_name_map[p_tensor->GetName()] = tensor_num; + tensor_num++; + } + v_nodes->offsets[i] = SaveTmNode(start_ptr, cur_pos, p_node, tensor_name_map); + } + /* Write the vector of nodes */ + tm_subgraph.offset_vo_seq_nodes = WriteTmObject(start_ptr, cur_pos, v_nodes, vector_size); + + /* Write the tensors */ + vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor_num; + TM2_Vector_offsets* v_tensors = ( TM2_Vector_offsets* 
)malloc(vector_size); + v_tensors->v_num = tensor_num; + for(unsigned int i = 0; i < tensor_num; i++) + { + Tensor* p_tensor = tensor_ptrs[i]; + if(p_tensor->GetType() == kConstTensor) + { + buf_ptrs.push_back(p_tensor->GetMemAddr()); + buf_sizes.push_back(p_tensor->GetTotalSize()); + buffer_num++; + } + + v_tensors->offsets[i] = SaveTmTensor(start_ptr, cur_pos, p_tensor, i, buffer_num - 1); + } + /* Write the vector of tensors */ + tm_subgraph.offset_vo_tensors = WriteTmObject(start_ptr, cur_pos, v_tensors, vector_size); + + /* Write the buffers */ + vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * buffer_num; + TM2_Vector_offsets* v_buffers = ( TM2_Vector_offsets* )malloc(vector_size); + v_buffers->v_num = buffer_num; + for(unsigned int i = 0; i < buffer_num; i++) + { + TM2_Buffer tm_buf; + tm_buf.size = buf_sizes[i]; + + if(tm_no_data) + { + /* TM2_FOR_BENCHMARK environment variable exists. Not write buf data into the tm file */ + tm_buf.offset_data = TM2_NOT_SET; + } + else + { + /* TM2_FOR_BENCHMARK environment variable does not exist */ + tm_buf.offset_data = + WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast(buf_ptrs[i]), tm_buf.size); + } + v_buffers->offsets[i] = WriteTmObject(start_ptr, cur_pos, &tm_buf, sizeof(TM2_Buffer)); + } + /* Write the vector of buffers */ + tm_subgraph.offset_vo_buffers = WriteTmObject(start_ptr, cur_pos, v_buffers, vector_size); + + /* Write the vector of input indices */ + vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->input_nodes.size(); + TM2_Vector_indices* v_input_indices = ( TM2_Vector_indices* )malloc(vector_size); + v_input_indices->v_num = graph->input_nodes.size(); + for(unsigned int i = 0; i < graph->input_nodes.size(); i++) + { + v_input_indices->indices[i] = graph->input_nodes[i]->GetNodeIndex(); + } + tm_subgraph.offset_vi_input_indices = WriteTmObject(start_ptr, cur_pos, v_input_indices, vector_size); + + /* Write the vector of output indices */ + vector_size = sizeof(tm_size_t) + 
sizeof(uint32_t) * graph->output_nodes.size(); + TM2_Vector_indices* v_output_indices = ( TM2_Vector_indices* )malloc(vector_size); + v_output_indices->v_num = graph->output_nodes.size(); + for(unsigned int i = 0; i < graph->output_nodes.size(); i++) + { + v_output_indices->indices[i] = graph->output_nodes[i]->GetNodeIndex(); + } + tm_subgraph.offset_vi_output_indices = WriteTmObject(start_ptr, cur_pos, v_output_indices, vector_size); + + /* Write the subgraph */ + tm_uoffset_t ret = WriteTmObject(start_ptr, cur_pos, &tm_subgraph, sizeof(TM2_Subgraph)); + + /* Free the memory of vectors */ + free(v_tensors); + free(v_buffers); + free(v_nodes); + free(v_input_indices); + free(v_output_indices); + + return ret; +} + +bool TmSerializer2::SaveModelIntoMem(void* start_ptr, Graph* graph, uint32_t* tm_model_size) +{ + bool tm_with_string = IsSaveString(); + + tm_uoffset_t cur_pos = sizeof(TM2_Header); + + /* Define the TM2_Header object */ + TM2_Header header; + header.ver_main = TM2_FILE_VER_MAIN; + header.ver_sub = TM2_FILE_VER_SUB; + header.ver_compile = TM2_FILE_VER_COMPILE; + + /* Define the TM2_Model object */ + TM2_Model tm_model; + tm_model.orig_format = graph->GetModelFormat(); + tm_model.sub_format = 0; + + if(tm_with_string) + { + const std::string& fname = graph->GetName(); + TM2_String model_name; + model_name.size = fname.size()+1; // including trailing \0 + model_name.offset_data = WriteTmFileAlign1(start_ptr, &cur_pos, fname.c_str(), model_name.size); + tm_model.offset_s_mname = WriteTmObject(start_ptr, &cur_pos, &model_name, sizeof(TM2_String)); + } + else + tm_model.offset_s_mname = TM2_NOT_SET; + + /* Write the subgraphs */ + /* Only 1 subgraph is supported currently */ + size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * 1; + TM2_Vector_offsets* v_subgraphs = ( TM2_Vector_offsets* )malloc(vector_size); + v_subgraphs->v_num = 1; + v_subgraphs->offsets[0] = SaveTmSubgraph(start_ptr, &cur_pos, graph); + + /* Write the vector of subgraphs */ + 
tm_model.offset_vo_subgraphs = WriteTmObject(start_ptr, &cur_pos, v_subgraphs, vector_size); + + /* Write the model */ + header.offset_root = WriteTmObject(start_ptr, &cur_pos, &tm_model, sizeof(TM2_Model)); + *tm_model_size = cur_pos; + + /* Write the header */ + cur_pos = 0; + WriteTmObject(start_ptr, &cur_pos, &header, sizeof(TM2_Header)); + + free(v_subgraphs); + + return true; +} + +bool TmSerializer2::LoadNode(StaticGraph* graph, StaticNode* node, const TM2_Node* tm_node, void* mmap_buf) +{ + if(tm_node->offset_vi_input_tensors != TM2_NOT_SET) + { + const TM2_Vector_indices* v_input_tensors = + GetTmPtr(mmap_buf, tm_node->offset_vi_input_tensors); + + /* Set the input tensors to the node */ + for(unsigned int i = 0; i < v_input_tensors->v_num; i++) + { + StaticTensor* tensor = graph->tensor_list[v_input_tensors->indices[i]].get(); + if(!tensor) + { + LOG_ERROR() << "The input tensor not exist: " << v_input_tensors->indices[i] << "\n"; + return false; + } + AddNodeInputTensor(node, tensor); + } + } + + if(tm_node->offset_vi_output_tensors != TM2_NOT_SET) + { + const TM2_Vector_indices* v_output_tensors = + GetTmPtr(mmap_buf, tm_node->offset_vi_output_tensors); + + /* Set the output tensors to the node */ + for(unsigned int i = 0; i < v_output_tensors->v_num; i++) + { + StaticTensor* tensor = graph->tensor_list[v_output_tensors->indices[i]].get(); + if(!tensor) + { + LOG_ERROR() << "The output tensor not exist: " << v_output_tensors->indices[i] << "\n"; + return false; + } + AddNodeOutputTensor(node, tensor); + } + } + + /* set the custom attributes into static node */ + if(tm_node->offset_vo_attrs == TM2_NOT_SET) + return true; + + const TM2_Vector_offsets* v_attrs = GetTmPtr(mmap_buf, tm_node->offset_vo_attrs); + for(unsigned int i = 0; i < v_attrs->v_num; i++) + { + const TM2_Attr* tm_attr = GetTmPtr(mmap_buf, v_attrs->offsets[i]); + const TM2_String* tm_attr_name = GetTmPtr(mmap_buf, tm_attr->offset_s_attrname); + const TM2_String* tm_attr_val = 
GetTmPtr(mmap_buf, tm_attr->offset_s_attrval); + + const char* attr_name = GetTmPtr(mmap_buf, tm_attr_name->offset_data); + const char* attr_val = GetTmPtr(mmap_buf, tm_attr_val->offset_data); + const char* type_name = int_to_typename(tm_attr->attr_type); + + if(NodeAddParamGeneric(node, attr_name, type_name, tm_attr_val->size) < 0 || + NodeSetParamGeneric(node, attr_name, type_name, attr_val, tm_attr_val->size) < 0) + { + LOG_ERROR() << "Add and set node param failed\n"; + return false; + } + } + + return true; +} + +bool TmSerializer2::LoadTensor(StaticGraph* graph, const TM2_Tensor* tm_tensor, const TM2_Buffer* tm_buf, void* mmap_buf) +{ + /* Set the tensor name */ + int idx = tm_tensor->tensor_id; + std::string tm_tensor_name; + if(tm_tensor->offset_s_tname == TM2_NOT_SET) + tm_tensor_name = "tensor_" + std::to_string(idx); + else + { + const TM2_String* tm_str = GetTmPtr(mmap_buf, tm_tensor->offset_s_tname); + tm_tensor_name.assign(GetTmPtr(mmap_buf, tm_str->offset_data), tm_str->size-1); + } + + /* Create the static tensor */ + StaticTensor* tensor; + if(tm_tensor->type == kConstTensor) + tensor = CreateStaticConstTensor(graph, tm_tensor_name); + else + tensor = CreateStaticTensor(graph, tm_tensor_name); + if(!tensor) + { + LOG_ERROR() << "Create static const tensor failed: " << tm_tensor_name << "\n"; + return false; + } + + /* Set the dims */ + if(tm_tensor->offset_vd_dims != TM2_NOT_SET) + { + const TM2_Vector_dims* v_dims = GetTmPtr(mmap_buf, tm_tensor->offset_vd_dims); + if(!v_dims || !(v_dims->v_num)) + { + LOG_ERROR() << "Get tensor dims failed\n"; + return false; + } + std::vector dims; + for(unsigned int i = 0; i < v_dims->v_num; i++) + dims.push_back(v_dims->dims[i]); + SetTensorDim(tensor, dims); + } + + /* Set the tensor type and the data type */ + SetTensorType(tensor, tm_tensor->type); + SetTensorDataType(tensor, tm_tensor->data_type); + + /* Set the memory size and pointer */ + if(tm_tensor->type == kConstTensor) + { + SetTensorSize(tensor, 
tm_buf->size); + void* buf = malloc(tm_buf->size); + if(tm_buf->offset_data != TM2_NOT_SET) + { + memcpy(buf, GetTmPtr(mmap_buf, tm_buf->offset_data), tm_buf->size); + } + + SetConstTensorBuffer(tensor, buf); + SetConstTensorFileLocation(tensor, -1, 0); + } + + /* Set the quant params */ + if(tm_tensor->offect_vo_quantparams != TM2_NOT_SET) + { + const TM2_Vector_offsets* v_quantparams = GetTmPtr(mmap_buf, tm_tensor->offect_vo_quantparams); + + /* currently only support one quant param */ + assert(v_quantparams->v_num == 1); + + const TM2_QuantParam* tm_qtparam = GetTmPtr(mmap_buf, v_quantparams->offsets[0]); + tensor->zero_point = tm_qtparam->zero_point; + tensor->scale = tm_qtparam->scale; + tensor->width = tm_qtparam->width; + } + + return true; +} + +bool TmSerializer2::LoadGraph(StaticGraph* graph, const TM2_Model* tm_model, void* mmap_buf) +{ + const TM2_Vector_offsets* v_graphs = GetTmPtr(mmap_buf, tm_model->offset_vo_subgraphs); + const TM2_Subgraph* tm_graph = GetTmPtr(mmap_buf, v_graphs->offsets[0]); + + const TM2_Vector_offsets* v_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vo_seq_nodes); + const TM2_Vector_offsets* v_tensors = GetTmPtr(mmap_buf, tm_graph->offset_vo_tensors); + const TM2_Vector_offsets* v_buffers = GetTmPtr(mmap_buf, tm_graph->offset_vo_buffers); + + SetGraphLayout(graph, tm_graph->graph_layout); + SetModelLayout(graph, tm_graph->model_layout); + + /* Load const tensors */ + for(unsigned int i = 0; i < v_tensors->v_num; i++) + { + const TM2_Tensor* tm_tensor = GetTmPtr(mmap_buf, v_tensors->offsets[i]); + const TM2_Buffer* tm_buf; + if(tm_tensor->type == kConstTensor) + tm_buf = GetTmPtr(mmap_buf, v_buffers->offsets[tm_tensor->buffer_id]); + else + tm_buf = nullptr; + LoadTensor(graph, tm_tensor, tm_buf, mmap_buf); + } + + /* Create static nodes */ + unsigned int i; + for(i = 0; i < v_nodes->v_num; i++) + { + const TM2_Node* tm_node = GetTmPtr(mmap_buf, v_nodes->offsets[i]); + int idx = tm_node->node_id; + std::string tm_node_name; + 
if(tm_node->offset_s_nname == TM2_NOT_SET) + tm_node_name = "node_" + std::to_string(idx); + else + { + const TM2_String* tm_str = GetTmPtr(mmap_buf, tm_node->offset_s_nname); + tm_node_name.assign(GetTmPtr(mmap_buf, tm_str->offset_data), tm_str->size-1); + } + + const TM2_Operator* tm_operator = GetTmPtr(mmap_buf, tm_node->offset_t_operator); + const std::string& tm_op_name = GetOpStr(tm_operator->operator_type); + + if(!FindOpLoadMethod(tm_op_name)) + { + LOG_ERROR() << "cannot find load function for operator: " << tm_op_name << "\n"; + break; + } + + StaticNode* node = CreateStaticNode(graph, tm_node_name); + if(!LoadNode(graph, node, tm_node, mmap_buf)) + break; + + op_load_t op_func = any_cast(GetOpLoadMethod(tm_op_name)); + + if(!op_func(graph, node, mmap_buf, tm_operator)) + break; + + /* Set the dynamic shape of the operator */ + node->op->dynamic_shape = tm_node->dynamic_shape; + } + + if(i < v_nodes->v_num) + return false; + + const TM2_Vector_indices* v_input_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vi_input_indices); + const TM2_Vector_indices* v_output_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vi_output_indices); + + /* Set the input nodes */ + for(unsigned int i = 0; i < v_input_nodes->v_num; i++) + { + StaticNode* node = graph->node_list[v_input_nodes->indices[i]].get(); + if(!node) + { + LOG_ERROR() << "Input node #" << v_input_nodes->indices[i] << " not exist\n"; + return false; + } + AddGraphInputNode(graph, node); + } + + /* Set the output nodes */ + for(unsigned int i = 0; i < v_output_nodes->v_num; i++) + { + StaticNode* node = graph->node_list[v_output_nodes->indices[i]].get(); + if(!node) + { + LOG_ERROR() << "Output node #" << v_output_nodes->indices[i] << " not exist\n"; + return false; + } + AddGraphOutputNode(graph, node); + } + + return true; +} + +bool TmSerializer2::LoadModelFromMem(void* mmap_buf, StaticGraph* graph) +{ + const TM2_Header* tm_header = reinterpret_cast(mmap_buf); + + const TM2_Model* tm_model = GetTmPtr(mmap_buf, 
tm_header->offset_root); + + /* Load dla tengine model */ + //if(tm_model->orig_format == MODEL_FORMAT_DLA) + // return LoadDlaModel(mmap_buf, graph); + + if(tm_model->offset_s_mname == TM2_NOT_SET) + { + SetGraphIdentity(graph, "tengine", "tengine_model", "0"); + } + else + { + std::string tm_model_name; + const TM2_String* tm_str = GetTmPtr(mmap_buf, tm_model->offset_s_mname); + tm_model_name.assign(GetTmPtr(mmap_buf, tm_str->offset_data), tm_str->size-1); + SetGraphIdentity(graph, "tengine", tm_model_name, "0"); + } + + SetModelFormat(graph, tm_model->orig_format); + + if(LoadGraph(graph, tm_model, mmap_buf)) + return true; + else + return false; +} + +bool TmSerializerRegisterOpLoader2(void) +{ + TmSerializerPtr serializer; + + if(!TmSerializerManager::SafeGet("tm_v2", serializer)) + return false; + + TmSerializer2* p_tengine = dynamic_cast(serializer.get()); + + for(int i = 0; i < TM2_OPTYPE_NUM; i++) + { + p_tengine->RegisterOpLoadMethod(GetOpStr(i), op_load_t(LoadTmOpFunc(i))); + p_tengine->RegisterOpSaveMethod(GetOpStr(i), op_save_t(SaveTmOpFunc(i))); + } + + return true; +} + +} // namespace TMSerializer2 + +} // namespace TEngine diff --git a/serializer/tensorflow/tf_serializer.cpp b/serializer/tensorflow/tf_serializer.cpp index 90f05a1ed..e2f363071 100644 --- a/serializer/tensorflow/tf_serializer.cpp +++ b/serializer/tensorflow/tf_serializer.cpp @@ -30,6 +30,8 @@ #include "tf_serializer.hpp" +#include "tengine_c_api.h" +#include "exec_attr.hpp" #include "data_type.hpp" #include "tengine_errno.hpp" @@ -48,6 +50,8 @@ #include "operator/softmax_param.hpp" #include "operator/generic_param.hpp" #include "operator/lstm_param.hpp" +#include "operator/rnn_param.hpp" +#include "operator/gru_param.hpp" #include "operator_manager.hpp" #include "type_name.hpp" @@ -101,6 +105,9 @@ bool TFSerializer::LoadModel(const std::vector& file_list, StaticGr SetGraphSource(graph, file_list[0]); SetGraphSourceFormat(graph, "tensorflow"); SetGraphConstTensorFile(graph, 
file_list[0]); + SetGraphLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelLayout(graph,TENGINE_LAYOUT_NHWC); + SetModelFormat(graph,MODEL_FORMAT_TENSORFLOW); return LoadGraph(tf_net, graph); } @@ -179,7 +186,7 @@ int TFSerializer::FindRNNScope(TFGraph& tf_graph, std::string& rnn_scope) break; } - cell_pos = name.find("gru", while_pos); + cell_pos = name.find("gru_cell", while_pos); if(cell_pos != std::string::npos) { @@ -196,6 +203,15 @@ int TFSerializer::FindRNNScope(TFGraph& tf_graph, std::string& rnn_scope) rnn_type = TF_RNN_BASIC_LSTM; break; } + + cell_pos = name.find("basic_rnn_cell", while_pos); + + if(cell_pos != std::string::npos) + { + rnn_node = node->name; + rnn_type = TF_RNN_BASIC_RNN; + break; + } } if(rnn_node.empty()) @@ -269,136 +285,469 @@ void TFSerializer::ParseLSTMGraph(TFGraph& tf_graph, LSTMNode* lstm_node, std::s rnn_ir++; } } - -void TFSerializer::StripRNNScope(TFGraph& tf_graph, std::string& rnn_scope, int rnn_type) +void TFSerializer::ParseRNNGraph(TFGraph& tf_graph, RNNNode* rnn_node, std::set& rnn_graph) { - LSTMNode* lstm_node = new LSTMNode(); + /* parse input node */ - lstm_node->name = rnn_scope + "lstm"; - lstm_node->op = "LSTM"; + for(unsigned int i = 0; i < rnn_node->inputs.size(); i++) + { + TFNode* node = rnn_node->inputs[i]; - std::set& rnn_graph = lstm_node->rnn_graph; + if(node->op != "Const") + continue; - std::set rnn_inputs; - std::set rnn_outputs; + // node->no_static_node=true; //do not automatically create Static Node - auto ir = tf_graph.seq_nodes.begin(); - std::string::size_type prefix_len = rnn_scope.size(); + if(node->name.find("basic_rnn_cell/kernel") != std::string::npos) + { + rnn_node->kernel = node; + } + else if(node->name.find("basic_rnn_cell/bias") != std::string::npos) + { + rnn_node->bias = node; + } + + } - while(ir != tf_graph.seq_nodes.end()) + auto rnn_ir = rnn_graph.begin(); + auto rnn_ir_end = rnn_graph.end(); + + while(rnn_ir != rnn_ir_end) { - TFNode* node = *ir; + TFNode* node = *rnn_ir; + int 
name_len = node->name.size(); + std::string zero_name = "BasicRNNCellZeroState/zeros"; - if(node->name.find(rnn_scope.c_str(), 0, prefix_len) == std::string::npos) - { - ir++; + if(node->name.find(zero_name, name_len - zero_name.size()) != std::string::npos) + rnn_node->init_h = node; + + rnn_ir++; + } +} +void TFSerializer::ParseGRUGraph(TFGraph& tf_graph, GRUNode* gru_node, std::set& rnn_graph) +{ + /* parse input node */ + + for(unsigned int i = 0; i < gru_node->inputs.size(); i++) + { + TFNode* node = gru_node->inputs[i]; + + if(node->op != "Const") continue; - } - /* this is a node, inside rnn scope, remove it from graph first */ - ir = tf_graph.seq_nodes.erase(ir); + // node->no_static_node=true; //do not automatically create Static Node - rnn_graph.insert(node); + if(node->name.find("gru_cell/gates/kernel") != std::string::npos) + { + gru_node->gate_kernel = node; + } + else if(node->name.find("gru_cell/gates/bias") != std::string::npos) + { + gru_node->gate_bias = node; + } + else if(node->name.find("gru_cell/candidate/kernel") != std::string::npos) + { + gru_node->candidate_kernel = node; + } + else if(node->name.find("gru_cell/candidate/bias") != std::string::npos) + { + gru_node->candidate_bias = node; + } + } auto rnn_ir = rnn_graph.begin(); - auto rnn_end = rnn_graph.end(); + auto rnn_ir_end = rnn_graph.end(); - while(rnn_ir != rnn_end) + while(rnn_ir != rnn_ir_end) { TFNode* node = *rnn_ir; + int name_len = node->name.size(); + std::string zero_name = "GRUCellZeroState/zeros"; - for(unsigned int i = 0; i < node->inputs.size(); i++) + if(node->name.find(zero_name, name_len - zero_name.size()) != std::string::npos) + gru_node->init_h = node; + + rnn_ir++; + } +} + +void TFSerializer::StripRNNScope(TFGraph& tf_graph, std::string& rnn_scope, int rnn_type) +{ + + // collect attributes according to rnn_type + + if(rnn_type == TF_RNN_LSTM) + { + LSTMNode* lstm_node = new LSTMNode(); + + lstm_node->name = rnn_scope + "lstm"; + lstm_node->op = "LSTM"; + + 
std::set& rnn_graph = lstm_node->rnn_graph; + + std::set rnn_inputs; + std::set rnn_outputs; + + auto ir = tf_graph.seq_nodes.begin(); + std::string::size_type prefix_len = rnn_scope.size(); + + while(ir != tf_graph.seq_nodes.end()) { - TFNode* input = node->inputs[i]; + TFNode* node = *ir; - if(!rnn_graph.count(input)) - rnn_inputs.insert(input); + if(node->name.find(rnn_scope.c_str(), 0, prefix_len) == std::string::npos) + { + ir++; + continue; + } + + /* this is a node, inside rnn scope, remove it from graph first */ + ir = tf_graph.seq_nodes.erase(ir); + + rnn_graph.insert(node); } - for(unsigned int i = 0; i < node->outputs.size(); i++) + auto rnn_ir = rnn_graph.begin(); + auto rnn_end = rnn_graph.end(); + + while(rnn_ir != rnn_end) { - TFNode* output = node->outputs[i]; + TFNode* node = *rnn_ir; + + for(unsigned int i = 0; i < node->inputs.size(); i++) + { + TFNode* input = node->inputs[i]; - if(!rnn_graph.count(output)) - rnn_outputs.insert(output); + if(!rnn_graph.count(input)) + rnn_inputs.insert(input); + } + + for(unsigned int i = 0; i < node->outputs.size(); i++) + { + TFNode* output = node->outputs[i]; + + if(!rnn_graph.count(output)) + rnn_outputs.insert(output); + } + + rnn_ir++; } - rnn_ir++; - } + // insert lstm node + auto seq_ir = tf_graph.seq_nodes.begin(); - // insert lstm node - auto seq_ir = tf_graph.seq_nodes.begin(); + while(seq_ir != tf_graph.seq_nodes.end()) + { + TFNode* node = *seq_ir; - while(seq_ir != tf_graph.seq_nodes.end()) - { - TFNode* node = *seq_ir; + if(rnn_inputs.count(node)) + { + tf_graph.seq_nodes.insert(seq_ir, lstm_node); + break; + } + + seq_ir++; + } + + // connect inputs and outputs + auto set_ir = rnn_inputs.begin(); + auto set_ir_end = rnn_inputs.end(); - if(rnn_inputs.count(node)) + while(set_ir != set_ir_end) { - tf_graph.seq_nodes.insert(seq_ir, lstm_node); - break; + TFNode* input_node = *set_ir; + + for(unsigned int j = 0; j < input_node->outputs.size(); j++) + { + TFNode* child_node = input_node->outputs[j]; + 
+ if(rnn_graph.count(child_node)) + input_node->outputs[j] = lstm_node; + } + + lstm_node->inputs.push_back(input_node); + + if(input_node->op == "Identity") + { + TFNode* parent_node = input_node->inputs[0]; + + MergeChildNode(parent_node, input_node); + } + + set_ir++; } - seq_ir++; - } + set_ir = rnn_outputs.begin(); + set_ir_end = rnn_outputs.end(); - // connect inputs and outputs - auto set_ir = rnn_inputs.begin(); - auto set_ir_end = rnn_inputs.end(); + while(set_ir != set_ir_end) + { + TFNode* output_node = *set_ir; + + for(unsigned int j = 0; j < output_node->inputs.size(); j++) + { + TFNode* parent_node = output_node->inputs[j]; - while(set_ir != set_ir_end) + if(rnn_graph.count(parent_node)) + output_node->inputs[j] = lstm_node; + } + + lstm_node->outputs.push_back(output_node); + set_ir++; + } + ParseLSTMGraph(tf_graph, lstm_node, rnn_graph); + } + + if(rnn_type == TF_RNN_BASIC_RNN) { - TFNode* input_node = *set_ir; + RNNNode* rnn_node = new RNNNode(); + + rnn_node->name = rnn_scope + "rnn"; + //std::cout<op = "RNN"; - for(unsigned int j = 0; j < input_node->outputs.size(); j++) + std::set& rnn_graph = rnn_node->rnn_graph; + + std::set rnn_inputs; + std::set rnn_outputs; + + auto ir = tf_graph.seq_nodes.begin(); + std::string::size_type prefix_len = rnn_scope.size(); + + while(ir != tf_graph.seq_nodes.end()) { - TFNode* child_node = input_node->outputs[j]; + TFNode* node = *ir; - if(rnn_graph.count(child_node)) - input_node->outputs[j] = lstm_node; + if(node->name.find(rnn_scope.c_str(), 0, prefix_len) == std::string::npos) + { + ir++; + continue; + } + + /* this is a node, inside rnn scope, remove it from graph first */ + ir = tf_graph.seq_nodes.erase(ir); + + rnn_graph.insert(node); } - lstm_node->inputs.push_back(input_node); + auto rnn_ir = rnn_graph.begin(); + auto rnn_end = rnn_graph.end(); - if(input_node->op == "Identity") + while(rnn_ir != rnn_end) { - TFNode* parent_node = input_node->inputs[0]; + TFNode* node = *rnn_ir; + + for(unsigned int i 
= 0; i < node->inputs.size(); i++) + { + TFNode* input = node->inputs[i]; + + if(!rnn_graph.count(input)) + rnn_inputs.insert(input); + } + + for(unsigned int i = 0; i < node->outputs.size(); i++) + { + TFNode* output = node->outputs[i]; + + if(!rnn_graph.count(output)) + rnn_outputs.insert(output); + } - MergeChildNode(parent_node, input_node); + rnn_ir++; } - set_ir++; - } + // insert rnn node + auto seq_ir = tf_graph.seq_nodes.begin(); - set_ir = rnn_outputs.begin(); - set_ir_end = rnn_outputs.end(); + while(seq_ir != tf_graph.seq_nodes.end()) + { + TFNode* node = *seq_ir; - while(set_ir != set_ir_end) - { - TFNode* output_node = *set_ir; + if(rnn_inputs.count(node)) + { + tf_graph.seq_nodes.insert(seq_ir, rnn_node); + break; + } + + seq_ir++; + } + + // connect inputs and outputs + auto set_ir = rnn_inputs.begin(); + auto set_ir_end = rnn_inputs.end(); + + while(set_ir != set_ir_end) + { + TFNode* input_node = *set_ir; - for(unsigned int j = 0; j < output_node->inputs.size(); j++) + for(unsigned int j = 0; j < input_node->outputs.size(); j++) + { + TFNode* child_node = input_node->outputs[j]; + + if(rnn_graph.count(child_node)) + input_node->outputs[j] = rnn_node; + } + + rnn_node->inputs.push_back(input_node); + + if(input_node->op == "Identity") + { + TFNode* parent_node = input_node->inputs[0]; + + MergeChildNode(parent_node, input_node); + } + + set_ir++; + } + + set_ir = rnn_outputs.begin(); + set_ir_end = rnn_outputs.end(); + + while(set_ir != set_ir_end) { - TFNode* parent_node = output_node->inputs[j]; + TFNode* output_node = *set_ir; + + for(unsigned int j = 0; j < output_node->inputs.size(); j++) + { + TFNode* parent_node = output_node->inputs[j]; - if(rnn_graph.count(parent_node)) - output_node->inputs[j] = lstm_node; + if(rnn_graph.count(parent_node)) + output_node->inputs[j] = rnn_node; + } + + rnn_node->outputs.push_back(output_node); + set_ir++; } - lstm_node->outputs.push_back(output_node); - set_ir++; + ParseRNNGraph(tf_graph, rnn_node, 
rnn_graph); } + if(rnn_type == TF_RNN_GRU) + { + GRUNode* gru_node = new GRUNode(); - // collect attributes according to rnn_type + gru_node->name = rnn_scope + "gru"; + //std::cout<op = "GRU"; - if(rnn_type == TF_RNN_LSTM) - { - ParseLSTMGraph(tf_graph, lstm_node, rnn_graph); + std::set& rnn_graph = gru_node->rnn_graph; + + std::set rnn_inputs; + std::set rnn_outputs; + + auto ir = tf_graph.seq_nodes.begin(); + std::string::size_type prefix_len = rnn_scope.size(); + + while(ir != tf_graph.seq_nodes.end()) + { + TFNode* node = *ir; + + if(node->name.find(rnn_scope.c_str(), 0, prefix_len) == std::string::npos) + { + ir++; + continue; + } + + /* this is a node, inside rnn scope, remove it from graph first */ + ir = tf_graph.seq_nodes.erase(ir); + + rnn_graph.insert(node); + } + + auto rnn_ir = rnn_graph.begin(); + auto rnn_end = rnn_graph.end(); + + while(rnn_ir != rnn_end) + { + TFNode* node = *rnn_ir; + + for(unsigned int i = 0; i < node->inputs.size(); i++) + { + TFNode* input = node->inputs[i]; + + if(!rnn_graph.count(input)) + rnn_inputs.insert(input); + } + + for(unsigned int i = 0; i < node->outputs.size(); i++) + { + TFNode* output = node->outputs[i]; + + if(!rnn_graph.count(output)) + rnn_outputs.insert(output); + } + + rnn_ir++; + } + + // insert rnn node + auto seq_ir = tf_graph.seq_nodes.begin(); + + while(seq_ir != tf_graph.seq_nodes.end()) + { + TFNode* node = *seq_ir; + + if(rnn_inputs.count(node)) + { + tf_graph.seq_nodes.insert(seq_ir, gru_node); + break; + } + + seq_ir++; + } + + // connect inputs and outputs + auto set_ir = rnn_inputs.begin(); + auto set_ir_end = rnn_inputs.end(); + + while(set_ir != set_ir_end) + { + TFNode* input_node = *set_ir; + + for(unsigned int j = 0; j < input_node->outputs.size(); j++) + { + TFNode* child_node = input_node->outputs[j]; + + if(rnn_graph.count(child_node)) + input_node->outputs[j] = gru_node; + } + + gru_node->inputs.push_back(input_node); + + if(input_node->op == "Identity") + { + TFNode* parent_node = 
input_node->inputs[0]; + + MergeChildNode(parent_node, input_node); + } + + set_ir++; + } + + set_ir = rnn_outputs.begin(); + set_ir_end = rnn_outputs.end(); + + while(set_ir != set_ir_end) + { + TFNode* output_node = *set_ir; + + for(unsigned int j = 0; j < output_node->inputs.size(); j++) + { + TFNode* parent_node = output_node->inputs[j]; + + if(rnn_graph.count(parent_node)) + output_node->inputs[j] = gru_node; + } + + gru_node->outputs.push_back(output_node); + set_ir++; + } + + ParseGRUGraph(tf_graph, gru_node, rnn_graph); } // cleanup zero in/zero out node - seq_ir = tf_graph.seq_nodes.begin(); + auto seq_ir = tf_graph.seq_nodes.begin(); while(seq_ir != tf_graph.seq_nodes.end()) { @@ -1352,7 +1701,6 @@ bool TFSerializer::GenerateStaticGraph(TFGraph& tf_graph, StaticGraph* graph) /* create tensor */ StaticTensor* tensor = CreateStaticTensor(graph, tf_node->name); - SetTensorDataLayout(tensor, "NCHW"); SetTensorDataType(tensor, DataType::GetTypeID("float32")); AddNodeOutputTensor(node, tensor); @@ -1422,7 +1770,6 @@ static void CreateInputNode(TFNode* tf_node, StaticGraph* graph) StaticTensor* tensor = CreateStaticTensor(graph, tf_node->name); - SetTensorDataLayout(tensor, "NCHW"); SetTensorDataType(tensor, DataType::GetTypeID("float32")); // if has shape, set it @@ -1678,7 +2025,6 @@ static bool LoadConstTensor(TFNode* tf_node, StaticGraph* graph) SetTensorDim(tensor, dims); SetTensorSize(tensor, mem_size); - SetTensorDataLayout(tensor, layout); SetConstTensorBuffer(tensor, mem_ptr); } @@ -1730,13 +2076,17 @@ static bool LoadConv2D(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) { if(value.s() == "VALID") { - param.pad_h = 0; - param.pad_w = 0; + param.pad_h0 = 0; + param.pad_h1 = 0; + param.pad_w0 = 0; + param.pad_w1 = 0; } else if(value.s() == "SAME") { - param.pad_h = -1; - param.pad_w = -1; + param.pad_h0 = -1; + param.pad_h1 = -1; + param.pad_w0 = -1; + param.pad_w1 = -1; } } @@ -1790,7 +2140,6 @@ static bool LoadConv2D(TFNode* tf_node, TFGraph& 
tf_graph, StaticGraph* graph) dims.push_back(kernel_w); SetTensorDim(weight_tensor, dims); - SetTensorDataLayout(weight_tensor, "NCHW"); param.kernel_h = kernel_h; param.kernel_w = kernel_w; @@ -1872,15 +2221,12 @@ static bool LoadConv2D(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) } } - /* update the padding arguments */ - saved_param.pads.resize(4); - /* h pad */ - saved_param.pads[0] = shape_data[2]; - saved_param.pads[2] = shape_data[3]; + saved_param.pad_h0 = shape_data[2]; + saved_param.pad_h1 = shape_data[3]; /* w pad */ - saved_param.pads[1] = shape_data[4]; - saved_param.pads[3] = shape_data[5]; + saved_param.pad_w0 = shape_data[4]; + saved_param.pad_w1 = shape_data[5]; SetOperatorParam(op, saved_param); } @@ -1919,13 +2265,17 @@ static bool LoadPool(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) { if(value.s() == "VALID") { - param.pad_h = 0; - param.pad_w = 0; + param.pad_h0 = 0; + param.pad_h1 = 0; + param.pad_w0 = 0; + param.pad_w1 = 0; } else if(value.s() == "SAME") { - param.pad_h = -1; - param.pad_w = -1; + param.pad_h0 = -1; + param.pad_h1 = -1; + param.pad_w0 = -1; + param.pad_w1 = -1; } } @@ -1938,21 +2288,6 @@ static bool LoadPool(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) param.alg = kPoolMax; } - // convert to onnx format - param.kernel_shape.resize(2); - param.kernel_shape[0] = param.kernel_h; - param.kernel_shape[1] = param.kernel_w; - - param.pads.resize(4); - param.pads[0] = param.pad_h; - param.pads[1] = param.pad_w; - param.pads[2] = param.pad_h; - param.pads[3] = param.pad_w; - - param.strides.resize(2); - param.strides[0] = param.stride_h; - param.strides[1] = param.stride_w; - StaticOp* op = CreateStaticOp(graph, "Pooling"); SetOperatorParam(op, param); SetNodeOp(node, op); @@ -2160,7 +2495,6 @@ static void CreatePresetNode(StaticGraph* graph, StaticNode* node, const char* n StaticTensor* tensor = CreateStaticConstTensor(graph, new_tensor_name); SetTensorDim(tensor, dims); SetTensorDataType(tensor, 
DataType::GetTypeID("float32")); - SetTensorDataLayout(tensor, layout); int elem_size = 1; @@ -2409,14 +2743,11 @@ static bool LoadGemm(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) param.beta = 1; StaticTensor* weight_tensor = FindTensor(graph, input1->name); - SetTensorDataLayout(weight_tensor, "HW"); if(tf_node->inputs.size() > 2) { TFNode* bias = tf_node->inputs[2]; AddNodeInputTensor(node, bias->static_tensor); - StaticTensor* bias_tensor = FindTensor(graph, bias->name); - SetTensorDataLayout(bias_tensor, "W"); } if(param.transA) @@ -2552,7 +2883,6 @@ static bool LoadLSTMInitState(LSTMNode* lstm_node, TFNode* init_node, StaticGrap SetTensorDataType(const_tensor, DataType::GetTypeID("float32")); SetTensorDim(const_tensor, dims); SetTensorSize(const_tensor, dims[0] * dims[1] * sizeof(float)); - SetTensorDataLayout(const_tensor, "W"); SetConstTensorBuffer(const_tensor, mem_ptr); SetConstTensorFileLocation(const_tensor, -1, 0); @@ -2565,6 +2895,136 @@ static bool LoadLSTMInitState(LSTMNode* lstm_node, TFNode* init_node, StaticGrap return true; } +static bool LoadGRUInitState(GRUNode* gru_node, TFNode* init_node, StaticGraph* graph) +{ + /* load const value */ + TFNode* const_val_node; + TFNode* concat_node; + + if(init_node->inputs[0]->op == "Const") + { + const_val_node = init_node->inputs[0]; + concat_node = init_node->inputs[1]; + } + else + { + const_val_node = init_node->inputs[1]; + concat_node = init_node->inputs[0]; + } + + int* const_ptr = ( int* )LoadConstParam(const_val_node); + float const_val = const_ptr[0]; + + free(const_ptr); + + // int* dim0_ptr = ( int* )LoadConstParam(concat_node->inputs[0]); + int* dim0_ptr = ( int* )LoadConstParam(concat_node->inputs[1]); + + std::vector dims(1); + + dims[0] = dim0_ptr[0]; + // dims[1] = dim1_ptr[0]; + + free(dim0_ptr); + // free(dim1_ptr); + + float* mem_ptr = ( float* )malloc(dims[0] * sizeof(float)); + + for(int i = 0; i < dims[0]; i++) + { + mem_ptr[i] = const_val; + } + + /* create node and 
tensor */ + + std::string const_node_name; + + if(init_node == gru_node->init_h) + const_node_name = gru_node->name + "/init_h"; + + StaticNode* const_node = CreateStaticNode(graph, const_node_name); + StaticTensor* const_tensor = CreateStaticConstTensor(graph, const_node_name); + + SetTensorDataType(const_tensor, DataType::GetTypeID("float32")); + SetTensorDim(const_tensor, dims); + SetTensorSize(const_tensor, dims[0]* sizeof(float)); + SetConstTensorBuffer(const_tensor, mem_ptr); + SetConstTensorFileLocation(const_tensor, -1, 0); + + AddNodeOutputTensor(const_node, const_tensor); + + StaticOp* const_op = CreateStaticOp(graph, "Const"); + SetNodeOp(const_node, const_op); + + AddNodeInputTensor(gru_node->static_node, const_tensor); + + return true; +} +static bool LoadRNNInitState(RNNNode* rnn_node, TFNode* init_node, StaticGraph* graph) +{ + /* load const value */ + TFNode* const_val_node; + TFNode* concat_node; + + if(init_node->inputs[0]->op == "Const") + { + const_val_node = init_node->inputs[0]; + concat_node = init_node->inputs[1]; + } + else + { + const_val_node = init_node->inputs[1]; + concat_node = init_node->inputs[0]; + } + + int* const_ptr = ( int* )LoadConstParam(const_val_node); + float const_val = const_ptr[0]; + + free(const_ptr); + + int* dim0_ptr = ( int* )LoadConstParam(concat_node->inputs[0]); + int* dim1_ptr = ( int* )LoadConstParam(concat_node->inputs[1]); + + std::vector dims(2); + + dims[0] = dim0_ptr[0]; + dims[1] = dim1_ptr[0]; + + free(dim0_ptr); + free(dim1_ptr); + + float* mem_ptr = ( float* )malloc(dims[0] * dims[1] * sizeof(float)); + + for(int i = 0; i < dims[0] * dims[1]; i++) + { + mem_ptr[i] = const_val; + } + + /* create node and tensor */ + + std::string const_node_name; + + if(init_node == rnn_node->init_h) + const_node_name = rnn_node->name + "/init_h"; + + StaticNode* const_node = CreateStaticNode(graph, const_node_name); + StaticTensor* const_tensor = CreateStaticConstTensor(graph, const_node_name); + + 
SetTensorDataType(const_tensor, DataType::GetTypeID("float32")); + SetTensorDim(const_tensor, dims); + SetTensorSize(const_tensor, dims[0] * dims[1] * sizeof(float)); + SetConstTensorBuffer(const_tensor, mem_ptr); + SetConstTensorFileLocation(const_tensor, -1, 0); + + AddNodeOutputTensor(const_node, const_tensor); + + StaticOp* const_op = CreateStaticOp(graph, "Const"); + SetNodeOp(const_node, const_op); + + AddNodeInputTensor(rnn_node->static_node, const_tensor); + + return true; +} static bool LoadLSTM(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) { @@ -2629,6 +3089,8 @@ static bool LoadLSTM(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) int cell_size = kernel_dims[1] / 4; param.cell_size = cell_size; + //mxnet false + param.mxnet_flag =0; if(lstm_node->projection) { @@ -2650,6 +3112,99 @@ static bool LoadLSTM(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) return true; } + +static bool LoadRNN(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) +{ + StaticNode* node = tf_node->static_node; + + RNNNode* rnn_node = dynamic_cast(tf_node); + RNNParam param = any_cast(OpManager::GetOpDefParam("RNN")); + + // those two are mandatory + AddNodeInputTensor(node, tf_node->inputs[0]->static_tensor); + AddNodeInputTensor(node, rnn_node->kernel->static_tensor); + + // optional tensors + if(rnn_node->bias) + { + param.has_bias = 1; + AddNodeInputTensor(node, rnn_node->bias->static_tensor); + } + + if(rnn_node->init_h) + { + param.has_init_state = 1; + LoadRNNInitState(rnn_node, rnn_node->init_h, graph); + } + + /* calculate and set other paremeters*/ + const std::vector& kernel_dims = GetTensorDim(rnn_node->kernel->static_tensor); + + int data_size = kernel_dims[0]; + + int hidden_size = kernel_dims[1]; + + param.hidden_size = hidden_size; + + param.input_size = data_size - param.hidden_size; + + StaticOp* op = CreateStaticOp(graph, "RNN"); + SetOperatorParam(op, param); + + SetNodeOp(node, op); + + return true; +} +static bool LoadGRU(TFNode* 
tf_node, TFGraph& tf_graph, StaticGraph* graph) +{ + StaticNode* node = tf_node->static_node; + + GRUNode* gru_node = dynamic_cast(tf_node); + GRUParam param = any_cast(OpManager::GetOpDefParam("GRU")); + + // those 3 are mandatory + AddNodeInputTensor(node, tf_node->inputs[0]->static_tensor); + AddNodeInputTensor(node, gru_node->gate_kernel->static_tensor); + AddNodeInputTensor(node, gru_node->candidate_kernel->static_tensor); + + // optional tensors + if(gru_node->gate_bias) + { + param.has_gate_bias = 1; + AddNodeInputTensor(node, gru_node->gate_bias->static_tensor); + } + if(gru_node->candidate_bias) + { + param.has_candidate_bias = 1; + AddNodeInputTensor(node, gru_node->candidate_bias->static_tensor); + } + + if(gru_node->init_h) + { + param.has_init_state = 1; + LoadGRUInitState(gru_node, gru_node->init_h, graph); + } + + /* calculate and set other paremeters*/ + const std::vector& kernel_dims = GetTensorDim(gru_node->gate_kernel->static_tensor); + + int data_size = kernel_dims[0]; + + int hidden_size = kernel_dims[1]; + + param.hidden_size = hidden_size/2; + + param.input_size = data_size - param.hidden_size; + + param.mxnet_flag=0; + StaticOp* op = CreateStaticOp(graph, "GRU"); + SetOperatorParam(op, param); + + SetNodeOp(node, op); + + return true; +} + } // namespace tf_serializer using namespace tf_serializer; @@ -2688,7 +3243,8 @@ bool TFSerializerRegisterOpLoader(void) p_tf->RegisterOpLoadMethod("AudioSpectrogram", op_load_t(LoadGeneric)); p_tf->RegisterOpLoadMethod("Mfcc", op_load_t(LoadGeneric)); p_tf->RegisterOpLoadMethod("LSTM", op_load_t(LoadLSTM)); - + p_tf->RegisterOpLoadMethod("RNN", op_load_t(LoadRNN)); + p_tf->RegisterOpLoadMethod("GRU", op_load_t(LoadGRU)); return true; } diff --git a/serializer/tf_lite/Makefile b/serializer/tf_lite/Makefile new file mode 100644 index 000000000..8a973b585 --- /dev/null +++ b/serializer/tf_lite/Makefile @@ -0,0 +1,3 @@ +obj-y+=tf_lite_serializer.o + +COMMON_CFLAGS+=-I../include/tf_lite diff --git 
a/serializer/tf_lite/tf_lite_serializer.cpp b/serializer/tf_lite/tf_lite_serializer.cpp new file mode 100644 index 000000000..88dd6be58 --- /dev/null +++ b/serializer/tf_lite/tf_lite_serializer.cpp @@ -0,0 +1,846 @@ +#include +#include + +#include "tengine_c_api.h" +#include "exec_attr.hpp" +#include "tf_lite_serializer.hpp" +#include "logger.hpp" +#include "data_type.hpp" + +#include "operator/conv_param.hpp" +#include "operator/pool_param.hpp" +#include "operator/concat_param.hpp" +#include "operator/reshape_param.hpp" +#include "operator/softmax_param.hpp" +#include "operator/detection_postprocess_param.hpp" +#include "operator/eltwise_param.hpp" +#include "flatbuffers/flexbuffers.h" + +namespace TEngine { + +using LiteNode = TFLiteSerializer::LiteNode; +using LiteTensor = TFLiteSerializer::LiteTensor; +using LiteGraph = TFLiteSerializer::LiteGraph; + +using op_load_t = std::function; + +bool TFLiteSerializer::LoadModel(const std::vector& file_list, StaticGraph* graph) +{ + if(file_list.size() != GetFileNum()) + return false; + + std::ifstream input_file; + + input_file.open(file_list[0], std::ios::binary | std::ios::in); + input_file.seekg(0, std::ios::end); + + int model_len = input_file.tellg(); + char* model_data = new char[model_len]; + + input_file.seekg(0, std::ios::beg); + input_file.read(model_data, model_len); + input_file.close(); + + SetGraphSource(graph, file_list[0]); + SetGraphSourceFormat(graph, "tflite"); + SetGraphLayout(graph,TENGINE_LAYOUT_NHWC); + SetModelLayout(graph,TENGINE_LAYOUT_NHWC); + SetModelFormat(graph,MODEL_FORMAT_TFLITE); + + bool ret = LoadModelFromMem(model_data, model_len, graph); + + if(!ret) + delete[] model_data; + + return ret; +} + +bool TFLiteSerializer::LoadModelFromMem(char* mem_addr, int mem_size, StaticGraph* graph) +{ + ::flatbuffers::Verifier verifier(( const unsigned char* )mem_addr, mem_size); + + if(!::tflite::VerifyModelBuffer(verifier)) + { + LOG_ERROR() << "bad tf lite model file\n"; + return false; + } + + 
const LiteModel* lite_model = ::tflite::GetModel(mem_addr); + + if(!lite_model->subgraphs() || lite_model->subgraphs()->size() != 1) + { + LOG_ERROR() << "bad graph format\n"; + return false; + } + + LiteGraph lite_graph; + + lite_graph.lite_model = lite_model; + + if(!ConstructGraph(lite_model, &lite_graph)) + return false; + + // DumpLiteGraph(&lite_graph); + + if(!OptimizeGraph(&lite_graph)) + return false; + + if(!GenerateStaticGraph(&lite_graph, graph)) + return false; + + return true; +} + +bool TFLiteSerializer::ConstructGraph(const LiteModel* lite_model, LiteGraph* lite_graph) +{ + // load all tensors first + + auto tensors = (*lite_model->subgraphs())[0]->tensors(); + + int i = 0; + + for(auto* tensor : *tensors) + { + LiteTensor* lite_tensor = new LiteTensor(); + + lite_tensor->tf_tensor = tensor; + lite_tensor->idx = i++; + lite_tensor->name = tensor->name()->c_str(); + + auto shape = tensor->shape(); + + for(unsigned int i = 0; i < shape->Length(); ++i) + lite_tensor->shape.push_back(shape->Get(i)); + + int type = tensor->type(); + + switch(type) + { + case ::tflite::TensorType_FLOAT32: + lite_tensor->type = "FP32"; + break; + case ::tflite::TensorType_UINT8: + lite_tensor->type = "UINT8"; + break; + case ::tflite::TensorType_INT32: + lite_tensor->type = "INT32"; + break; + default: + lite_tensor->type = "unknown"; + } + + lite_graph->tensor_list.push_back(lite_tensor); + } + + // load ops + + const auto ops = (*lite_model->subgraphs())[0]->operators(); + const auto opcodes = lite_model->operator_codes(); + + i = 0; + + for(auto* op : *ops) + { + LiteNode* lite_node = new LiteNode(); + + lite_node->lite_op = op; + + /* get op name */ + + int op_code_idx = op->opcode_index(); + + const auto* op_code = opcodes->Get(op_code_idx); + + if(op_code->builtin_code() == ::tflite::BuiltinOperator_CUSTOM) + lite_node->op = op_code->custom_code()->c_str(); + else + lite_node->op = EnumNameBuiltinOperator(op_code->builtin_code()); + + /*inputs and outputs */ + auto 
inputs = op->inputs(); + + for(unsigned int i = 0; i < inputs->Length(); i++) + { + auto input_idx = inputs->Get(i); + + if(input_idx != -1) + { + LiteTensor* lite_tensor = lite_graph->tensor_list.at(input_idx); + lite_node->inputs.push_back(lite_tensor); + } + else + { + LiteTensor* lite_tensor = new LiteTensor(); + + lite_tensor->name = "NoData"; + lite_tensor->idx = lite_graph->tensor_list.size(); + + lite_graph->tensor_list.push_back(lite_tensor); + + lite_node->inputs.push_back(lite_tensor); + } + } + + auto outputs = op->outputs(); + + for(unsigned int i = 0; i < outputs->Length(); i++) + { + auto output_idx = outputs->Get(i); + LiteTensor* lite_tensor; + + if(output_idx != -1) + { + lite_tensor = lite_graph->tensor_list.at(output_idx); + lite_node->outputs.push_back(lite_tensor); + } + else + { + lite_tensor = new LiteTensor(); + lite_node->outputs.push_back(lite_tensor); + } + + lite_tensor->producer = lite_node; + } + + lite_node->name = lite_node->outputs[0]->name; + + lite_graph->seq_nodes.push_back(lite_node); + } + + // setup graph inputs/outputs + auto inputs = (*lite_model->subgraphs())[0]->inputs(); + + if(inputs) + { + for(int input : *inputs) + { + LiteTensor* tensor = lite_graph->tensor_list.at(input); + lite_graph->input_tensors.push_back(tensor); + tensor->graph_input = true; + } + } + + auto outputs = (*lite_model->subgraphs())[0]->outputs(); + + if(outputs) + { + for(int output : *outputs) + { + LiteTensor* tensor = lite_graph->tensor_list.at(output); + tensor->graph_output = true; + lite_graph->output_tensors.push_back(tensor); + } + } + + return true; +} + +bool TFLiteSerializer::OptimizeGraph(LiteGraph* lite_graph) +{ + return true; +} + +bool TFLiteSerializer::LoadTensorScaleAndZero(StaticTensor* static_tensor, LiteTensor* lite_tensor) +{ + auto quantization = lite_tensor->tf_tensor->quantization(); + float scale = 1.f; + int zero_point = 0; + + if(quantization->scale() && quantization->zero_point()) + { + scale = 
quantization->scale()->Get(0); + zero_point = quantization->zero_point()->Get(0); + } + static_tensor->scale = scale; + static_tensor->zero_point = zero_point; + + return true; +} + +bool TFLiteSerializer::LoadConstLiteTensor(StaticTensor* static_tensor, LiteTensor* tensor, LiteGraph* lite_graph, + StaticGraph* graph) +{ + void* mem_buf; + int shape_size = 1; + int mem_size; + const TFLiteTensor* tf_tensor = tensor->tf_tensor; + + auto* buffers = lite_graph->lite_model->buffers(); + int buf_idx = tf_tensor->buffer(); + + auto* buffer = buffers->Get(buf_idx); + auto* src_buf = buffer->data(); + + for(unsigned int i = 0; i < tensor->shape.size(); i++) + shape_size *= tensor->shape[i]; + + int element_size = DataType::GetTypeSize(static_tensor->data_type); + mem_size = shape_size * element_size; + + mem_buf = malloc(mem_size); + + if(tensor->type == "UINT8") + { + const uint8_t* src_ptr = ( const uint8_t* )(src_buf->data()); + memcpy(mem_buf, src_ptr, mem_size); + } + else if(tensor->type == "INT32") + { + const int* src_ptr = ( const int* )src_buf->data(); + memcpy(mem_buf, src_ptr, mem_size); + } + else + { + const void* src_ptr = src_buf->data(); + memcpy(mem_buf, src_ptr, mem_size); + } + + // DIM SWITCH WILL BE DELAYED to OP LOAD + SetConstTensorBuffer(static_tensor, mem_buf); + SetConstTensorFileLocation(static_tensor, -1, 0); + + StaticOp* op = CreateStaticOp(graph, "Const"); + StaticNode* node = CreateStaticNode(graph, tensor->name); + + SetNodeOp(node, op); + + AddNodeOutputTensor(node, static_tensor); + + return true; +} + +bool TFLiteSerializer::LoadLiteTensor(LiteTensor* tensor, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticTensor* static_tensor; + bool const_tensor = false; + + if(tensor->producer || tensor->graph_input) + { + static_tensor = CreateStaticTensor(graph, tensor->name); + } + else + { + const_tensor = true; + static_tensor = CreateStaticConstTensor(graph, tensor->name); + } + int data_type; + if(tensor->type == "UINT8") + data_type = 
TENGINE_DT_UINT8; + else if(tensor->type == "INT32") + data_type = TENGINE_DT_INT32; + else + { + data_type = TENGINE_DT_FP32; + } + SetTensorDataType(static_tensor, data_type); + SetTensorDim(static_tensor, tensor->shape); + + LoadTensorScaleAndZero(static_tensor, tensor); + + tensor->static_tensor = static_tensor; + + // layout will be set during the op load + + // Load Const Tensor + if(const_tensor) + return LoadConstLiteTensor(static_tensor, tensor, lite_graph, graph); + + return true; +} + +bool TFLiteSerializer::LoadLiteNode(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + if(!FindOpLoadMethod(node->op)) + { + LOG_ERROR() << "cannot find load method for op: " << node->op << "\n"; + return false; + } + + StaticNode* static_node = CreateStaticNode(graph, node->name); + + // handle input + for(unsigned int i = 0; i < node->inputs.size(); i++) + { + LiteTensor* input = node->inputs.at(i); + AddNodeInputTensor(static_node, input->static_tensor); + } + + // handle output + + for(unsigned int i = 0; i < node->outputs.size(); i++) + { + LiteTensor* output = node->outputs.at(i); + AddNodeOutputTensor(static_node, output->static_tensor); + } + + // for each op, load the op + op_load_t op_func = any_cast(GetOpLoadMethod(node->op)); + + node->static_node = static_node; + + if(!op_func(node, lite_graph, graph)) + { + LOG_ERROR() << "failed to load node: " << node->name << " op: " << node->op << "\n"; + return false; + } + + return true; +} + +void TFLiteSerializer::CreateGraphInputNode(LiteTensor* tensor, StaticGraph* graph) +{ + StaticOp* op = CreateStaticOp(graph, "InputOp"); + StaticNode* node = CreateStaticNode(graph, tensor->name); + + SetNodeOp(node, op); + + AddNodeOutputTensor(node, tensor->static_tensor); + + AddGraphInputNode(graph, node); +} + +bool TFLiteSerializer::GenerateStaticGraph(LiteGraph* lite_graph, StaticGraph* graph) +{ + // first load all tensor + int tensor_number = lite_graph->tensor_list.size(); + + for(int i = 0; i < 
tensor_number; i++) + { + LiteTensor* tensor = lite_graph->tensor_list.at(i); + + LoadLiteTensor(tensor, lite_graph, graph); + } + + // create input node for graph_input tensor + for(unsigned int i = 0; i < lite_graph->input_tensors.size(); i++) + { + LiteTensor* tensor = lite_graph->input_tensors.at(i); + + CreateGraphInputNode(tensor, graph); + } + + // second load all nodes + int node_number = lite_graph->seq_nodes.size(); + + for(int i = 0; i < node_number; i++) + { + LiteNode* node = lite_graph->seq_nodes.at(i); + + if(!LoadLiteNode(node, lite_graph, graph)) + return false; + } + + return true; +} + +void TFLiteSerializer::DumpLiteTensor(LiteTensor* tensor) +{ + std::cout << tensor->name << " " << tensor->type << " ["; + for(unsigned int i = 0; i < tensor->shape.size(); i++) + std::cout << " " << tensor->shape[i]; + + std::cout << "] "; + + if(!tensor->producer && !tensor->graph_input) + std::cout << " Const "; +} + +void TFLiteSerializer::DumpLiteGraph(LiteGraph* lite_graph) +{ + for(unsigned int i = 0; i < lite_graph->seq_nodes.size(); i++) + { + LiteNode* node = lite_graph->seq_nodes.at(i); + + std::cout << i << ":\t" << node->op << " \t" << node->name << "\n"; + std::cout << "\tInput: " << node->inputs.size() << " Output: " << node->outputs.size() << "\n"; + + for(unsigned int j = 0; j < node->inputs.size(); j++) + { + LiteTensor* tensor = node->inputs[j]; + std::cout << "\t I" << j << ": "; + DumpLiteTensor(tensor); + std::cout << "\n"; + } + + for(unsigned int j = 0; j < node->outputs.size(); j++) + { + LiteTensor* tensor = node->outputs[j]; + std::cout << "\t O" << j << ": "; + DumpLiteTensor(tensor); + std::cout << "\n"; + } + } + std::cout << "\nGraph Inputs:\n"; + + for(unsigned int i = 0; i < lite_graph->input_tensors.size(); i++) + { + LiteTensor* tensor = lite_graph->input_tensors.at(i); + std::cout << "\t" << i << "\t" << tensor->name << "\n"; + } + + std::cout << "\nGraph Outputs:\n"; + + for(unsigned int i = 0; i < 
lite_graph->output_tensors.size(); i++) + { + LiteTensor* tensor = lite_graph->output_tensors.at(i); + std::cout << "\t" << i << "\t" << tensor->name << "\n"; + } +} + +namespace tf_lite_serializer { + +static void ExchangeNC(const std::vector& shape, std::vector& new_shape) +{ + new_shape.resize(4); + + new_shape[0] = shape[3]; + new_shape[1] = shape[1]; + new_shape[2] = shape[2]; + new_shape[3] = shape[0]; +} + +static bool LoadConv2D(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + int kernel_h = 1, kernel_w = 1, output_channel = 1; + LiteTensor* lite_tensor = node->inputs[1]; + + output_channel = lite_tensor->shape[0]; + kernel_h = lite_tensor->shape[1]; + kernel_w = lite_tensor->shape[2]; + + ConvParam param = any_cast(OpManager::GetOpDefParam("Convolution")); + const tflite::Conv2DOptions* lite_param = node->lite_op->builtin_options_as(); + + int lite_activation = lite_param->fused_activation_function(); + switch(lite_activation) + { + case 0: + param.activation = -1; + break; + case 1: + param.activation = 0; + break; + case 2: + param.activation = 1; + break; + case 3: + param.activation = 6; + break; + default: + param.activation = -4; + break; + } + param.stride_h = lite_param->stride_h(); + param.stride_w = lite_param->stride_w(); + int padding = lite_param->padding(); + if(padding == 0) + { + param.pad_h0 = -1; + param.pad_h1 = -1; + param.pad_w0 = -1; + param.pad_w1 = -1; + } + else + { + param.pad_h0 = 0; + param.pad_h1 = 0; + param.pad_w0 = 0; + param.pad_w1 = 0; + } + param.dilation_h = 1; + param.dilation_w = 1; + param.group = 1; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.output_channel = output_channel; + + StaticOp* op = CreateStaticOp(graph, "Convolution"); + + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + // bias + + return true; +} + +static bool LoadConv2DDepthwise(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* 
static_node = node->static_node; + int kernel_h = 1, kernel_w = 1, output_channel = 1; + LiteTensor* lite_tensor = node->inputs[1]; + { + output_channel = lite_tensor->static_tensor->dims[3]; + kernel_h = lite_tensor->static_tensor->dims[1]; + kernel_w = lite_tensor->static_tensor->dims[2]; + } + ConvParam param = any_cast(OpManager::GetOpDefParam("Convolution")); + const tflite::DepthwiseConv2DOptions* lite_param = + node->lite_op->builtin_options_as(); + + int lite_activation = lite_param->fused_activation_function(); + switch(lite_activation) + { + case 0: + param.activation = -1; + break; + case 1: + param.activation = 0; + break; + case 2: + param.activation = 1; + break; + case 3: + param.activation = 6; + break; + default: + param.activation = -4; + break; + } + + param.stride_h = lite_param->stride_h(); + param.stride_w = lite_param->stride_w(); + param.group = output_channel / lite_param->depth_multiplier(); + int padding = lite_param->padding(); + if(padding == 0) + { + param.pad_h0 = -1; + param.pad_h1 = -1; + param.pad_w0 = -1; + param.pad_w1 = -1; + } + else + { + param.pad_h0 = 0; + param.pad_h1 = 0; + param.pad_w0 = 0; + param.pad_w1 = 0; + } + + param.dilation_h = 1; + param.dilation_w = 1; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.output_channel = output_channel; + + StaticOp* op = CreateStaticOp(graph, "Convolution"); + + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + std::vector new_shape; + ExchangeNC(node->inputs[1]->shape, new_shape); + SetTensorDim(node->inputs[1]->static_tensor, new_shape); + + + return true; +} + +static bool LoadPooling(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + PoolParam param = any_cast(OpManager::GetOpDefParam("Pooling")); + const tflite::Pool2DOptions* lite_param = node->lite_op->builtin_options_as(); + + param.kernel_h = lite_param->filter_height(); + param.kernel_w = lite_param->filter_width(); + + 
param.stride_h = lite_param->stride_h(); + param.stride_w = lite_param->stride_w(); + + if(lite_param->padding() == 0) + { + param.pad_h0 = -1; + param.pad_h1 = -1; + param.pad_w0 = -1; + param.pad_w1 = -1; + } + else + { + param.pad_h0 = 0; + param.pad_h1 = 0; + param.pad_w0 = 0; + param.pad_w1 = 0; + } + + if(node->op == "AVERAGE_POOL_2D") + param.alg = kPoolAvg; + else if(node->op == "MAX_POOL_2D") + param.alg = kPoolMax; + + StaticOp* op = CreateStaticOp(graph, "Pooling"); + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + return true; +} + +static bool LoadConcat(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + ConcatParam param = any_cast(OpManager::GetOpDefParam("Concat")); + const tflite::ConcatenationOptions* lite_param = node->lite_op->builtin_options_as(); + int activation = lite_param->fused_activation_function(); + + param.axis = lite_param->axis(); + + StaticOp* op = CreateStaticOp(graph, "Concat"); + if(activation) + AddOperatorAttr(op, "Activation", activation); + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + + return true; +} + +static bool LoadReshape(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + + ReshapeParam param = any_cast(OpManager::GetOpDefParam("Reshape")); + StaticTensor* output_tensor = node->outputs[0]->static_tensor; + // const tflite::ReshapeOptions * lite_param = + // node->lite_op->builtin_options_as(); + // set dims + auto new_shape = output_tensor->dims; + if(new_shape.size() == 4) + { + param.dim_0 = new_shape[0]; + param.dim_1 = new_shape[1]; + param.dim_2 = new_shape[2]; + param.dim_3 = new_shape[3]; + } + else if(new_shape.size() == 3) + { + param.dim_0 = new_shape[0]; + param.dim_1 = new_shape[1]; + param.dim_2 = new_shape[2]; + } + else if(new_shape.size() == 2) + { + param.dim_0 = new_shape[0]; + param.dim_1 = new_shape[1]; + } + else + return false; + + StaticOp* op = 
CreateStaticOp(graph, "Reshape"); + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + return true; +} + +static bool LoadLogistic(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + + StaticOp* op = CreateStaticOp(graph, "Logistic"); + SetNodeOp(static_node, op); + + return true; +} + +static bool LoadSoftmax(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + SoftmaxParam param = any_cast(OpManager::GetOpDefParam("Softmax")); + + param.axis = 1; + StaticOp* op = CreateStaticOp(graph, "Softmax"); + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + return true; +} + +static bool LoadEltwise(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + EltwiseParam param = any_cast(OpManager::GetOpDefParam("Eltwise")); + + if(node->op == "ADD") + param.type = ELT_SUM; + else if(node->op == "SUB") + param.type = ELT_SUB; + else if(node->op == "PROD") + param.type = ELT_PROD; + else if(node->op == "RSQRT") + param.type = ELT_RSQRT; + else if(node->op == "DIV") + param.type = ELT_DIV; + else if(node->op == "LOG") + param.type = ELT_LOG; + else if(node->op == "EXP") + param.type = ELT_EXP; + else if(node->op == "POW") + param.type = ELT_POW; + else if(node->op == "SQRT") + param.type = ELT_SQRT; + else if(node->op == "FLOOR") + param.type = ELT_FLOOR; + StaticOp* op = CreateStaticOp(graph, "Eltwise"); + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + return true; +} + +static bool LoadDetectionPostProcess(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + + DetectionPostProcessParam param = + any_cast(OpManager::GetOpDefParam("DetectionPostProcess")); + const uint8_t* lite_buffer = node->lite_op->custom_options()->data(); + size_t lite_buffer_len = node->lite_op->custom_options()->size(); + + const 
flexbuffers::Map& m = flexbuffers::GetRoot(lite_buffer, lite_buffer_len).AsMap(); + param.max_detections = m["max_detections"].AsInt32(); + param.max_classes_per_detection = m["max_classes_per_detection"].AsInt32(); + param.nms_score_threshold = m["nms_score_threshold"].AsFloat(); + param.nms_iou_threshold = m["nms_iou_threshold"].AsFloat(); + param.num_classes = m["num_classes"].AsInt32(); + param.scales.resize(4); + param.scales[0] = m["y_scale"].AsFloat(); + param.scales[1] = m["x_scale"].AsFloat(); + param.scales[2] = m["h_scale"].AsFloat(); + param.scales[3] = m["w_scale"].AsFloat(); + + StaticOp* op = CreateStaticOp(graph, "DetectionPostProcess"); + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + return true; +} + +} // namespace tf_lite_serializer + +using namespace tf_lite_serializer; + +bool TFLiteSerializerRegisterOpLoader(void) +{ + SerializerPtr serializer; + + if(!SerializerManager::SafeGet("tflite", serializer)) + return false; + + TFLiteSerializer* tf_lite = dynamic_cast(serializer.get()); + + tf_lite->RegisterOpLoadMethod("CONV_2D", op_load_t(LoadConv2D)); + tf_lite->RegisterOpLoadMethod("AVERAGE_POOL_2D", op_load_t(LoadPooling)); + tf_lite->RegisterOpLoadMethod("MAX_POOL_2D", op_load_t(LoadPooling)); + tf_lite->RegisterOpLoadMethod("DEPTHWISE_CONV_2D", op_load_t(LoadConv2DDepthwise)); + tf_lite->RegisterOpLoadMethod("RESHAPE", op_load_t(LoadReshape)); + tf_lite->RegisterOpLoadMethod("SQUEEZE", op_load_t(LoadReshape)); + tf_lite->RegisterOpLoadMethod("CONCATENATION", op_load_t(LoadConcat)); + tf_lite->RegisterOpLoadMethod("LOGISTIC", op_load_t(LoadLogistic)); + tf_lite->RegisterOpLoadMethod("SOFTMAX", op_load_t(LoadSoftmax)); + tf_lite->RegisterOpLoadMethod("ADD", op_load_t(LoadEltwise)); + tf_lite->RegisterOpLoadMethod("TFLite_Detection_PostProcess", op_load_t(LoadDetectionPostProcess)); + + return true; +} + +} // namespace TEngine diff --git a/sysroot/Makefile b/sysroot/Makefile index 1c8007936..00e7cb9ea 100644 --- 
a/sysroot/Makefile +++ b/sysroot/Makefile @@ -14,3 +14,4 @@ debian32: .PHONY: ubuntu debian ubuntu32 debian32 + diff --git a/tests/Makefile b/tests/Makefile index b3534eb7b..b6dd4e0ca 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -69,6 +69,7 @@ $(BUILD_DIR)/%: $(BUILD_DIR)/%.o OPENCV_LIB=$(shell pkg-config --libs-only-l --libs-only-L opencv) LIBS+=-L ../build/ -ltengine -Wl,-allow-shlib-undefined + SO_LIBS+=-lprotobuf -ldl -lpthread SO_LIBS+=-Wl,-rpath,./build/ SO_LIBS+=$(OPENCV_LIB) diff --git a/tests/bin/Makefile b/tests/bin/Makefile index 0d1345d50..85b49b8c2 100644 --- a/tests/bin/Makefile +++ b/tests/bin/Makefile @@ -1,32 +1,43 @@ bin-obj-y+=bench_sqz.o bin-obj-y+=bench_mobilenet.o -bin-obj-y+=test_mxnet_sqz.o -bin-obj-y+=test_mxnet_mobilenet.o -bin-obj-y+=test_onnx_sqz.o bin-obj-y+=vgg16.o bin-obj-y+=test_deploy.o bin-obj-y+=demo.o bin-obj-y+=test_perf_stat.o bin-obj-y+=test_node_dump.o bin-obj-y+=two_model_demo.o -bin-obj-y+=test_lstm.o + +bin-obj-y+=test_rnn.o bin-obj-$(CONFIG_ACL_GPU)+=mt_mssd.o +ifeq ($(CONFIG_MXNET_SERIALIZER),y) +bin-obj-y+=test_mxnet.o +bin-obj-y+=test_mxnet_sqz.o +bin-obj-y+=test_mxnet_mobilenet.o +bin-obj-y+=test_mxnet_mobileface.o +bin-obj-y+=test_mxnet_lstm.o +bin-obj-y+=test_mxnet_gru.o +endif + ifeq ($(CONFIG_TF_SERIALIZER),y) bin-obj-y+=test_tf_mobilenet.o bin-obj-y+=test_tf_inceptionv3.o bin-obj-y+=test_tf_resnet50.o bin-obj-y+=test_tf.o +bin-obj-y+=test_tf_gru.o +bin-obj-y+=test_tf_lstm.o endif ifeq ($(CONFIG_TENGINE_SERIALIZER),y) bin-obj-y+=test_tm.o bin-obj-y+=save_model_src.o bin-obj-y+=load_model_src.o +bin-obj-y+=test_mobilenet.o endif ifeq ($(CONFIG_ONNX_SERIALIZER),y) +bin-obj-y+=test_onnx_sqz.o bin-obj-y+=test_onnx.o endif @@ -35,3 +46,4 @@ bin-obj-y+=tf_lite_mssd.o bin-obj-y+=tf_lite_mssd_quant.o bin-obj-y+=tf_lite_mobilenet_quant.o endif + diff --git a/tests/bin/bench_mobilenet.cpp b/tests/bin/bench_mobilenet.cpp index fe4fdda54..d28969acd 100644 --- a/tests/bin/bench_mobilenet.cpp +++ 
b/tests/bin/bench_mobilenet.cpp @@ -126,10 +126,9 @@ int main(int argc, char* argv[]) get_input_data(image_file, input_data, img_h, img_w, channel_mean, 0.017); - if(cpu_list_str) - set_cpu_list(cpu_list_str); - + set_cpu_list(cpu_list_str); + init_tengine(); std::cout << "run-time library version: " << get_tengine_version() << "\n"; @@ -137,7 +136,6 @@ int main(int argc, char* argv[]) if(request_tengine_version("0.9") < 0) return -1; - graph_t graph = create_graph(nullptr, "caffe", text_file, model_file); if(graph == nullptr) diff --git a/tests/bin/bench_sqz.cpp b/tests/bin/bench_sqz.cpp index 55f82bb93..e6afcc152 100644 --- a/tests/bin/bench_sqz.cpp +++ b/tests/bin/bench_sqz.cpp @@ -133,7 +133,7 @@ int main(int argc, char* argv[]) get_input_data(image_file, input_data, img_h, img_w, channel_mean, 1); if(cpu_list_str) - set_cpu_list(cpu_list_str); + set_cpu_list(cpu_list_str); init_tengine(); @@ -142,6 +142,7 @@ int main(int argc, char* argv[]) if(request_tengine_version("0.9") < 0) return -1; + graph_t graph = create_graph(nullptr, "caffe", text_file, model_file); if(graph == nullptr) diff --git a/tests/bin/load_model_src.cpp b/tests/bin/load_model_src.cpp index cde37a44d..1c5b64ab7 100644 --- a/tests/bin/load_model_src.cpp +++ b/tests/bin/load_model_src.cpp @@ -139,17 +139,15 @@ int main(int argc, char* argv[]) float* input_data = ( float* )malloc(sizeof(float) * img_h * img_w * 3); get_input_data(image_file, input_data, img_h, img_w, channel_mean, 1); - + if(cpu_list_str) set_cpu_list(cpu_list_str); - + init_tengine(); if(request_tengine_version("0.9") < 0) return 1; - - /* src_tm: the serailizer registered name * squeeze_net: the model name when saving the model */ diff --git a/tests/bin/test_deploy.cpp b/tests/bin/test_deploy.cpp index a92029b84..daa3f3051 100644 --- a/tests/bin/test_deploy.cpp +++ b/tests/bin/test_deploy.cpp @@ -72,6 +72,14 @@ int main(int argc, char* argv[]) tensor_t input_tensor = get_graph_tensor(graph, input_tensor_name); int dims[] = 
{1, 3, img_h, img_w}; set_tensor_shape(input_tensor, dims, 4); + + // if use gpu + int use_gpu = 0; + const char* gpu_flag = std::getenv("USE_GPU"); + if (gpu_flag) use_gpu= atoi(gpu_flag); + if (use_gpu) set_graph_device(graph, "acl_opencl"); + // + int ret_prerun = prerun_graph(graph); if(ret_prerun < 0) { diff --git a/tests/bin/test_mobilenet.cpp b/tests/bin/test_mobilenet.cpp new file mode 100644 index 000000000..07342f5a3 --- /dev/null +++ b/tests/bin/test_mobilenet.cpp @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ +#include +#include + +#include +#include +#include +#include +#include + +#include "tengine_c_api.h" +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" + +const char* model_file = "./models/mobilenet.tm"; +const char* image_file = "./tests/images/cat.jpg"; +const char* label_file = "./models/synset_words.txt"; + +const float channel_mean[3] = {104.007, 116.669, 122.679}; + + +int repeat_count = 100; + +unsigned long get_cur_time(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + + return (tv.tv_sec * 1000000 + tv.tv_usec); +} + + +void LoadLabelFile(std::vector& result, const char* fname) +{ + std::ifstream labels(fname); + + std::string line; + while(std::getline(labels, line)) + result.push_back(line); +} + +static inline bool PairCompare(const std::pair& lhs, const std::pair& rhs) +{ + return lhs.first > rhs.first; +} + + +static inline std::vector Argmax(const std::vector& v, int N) +{ + std::vector> pairs; + for(size_t i = 0; i < v.size(); ++i) + pairs.push_back(std::make_pair(v[i], i)); + std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare); + + std::vector result; + for(int i = 0; i < N; ++i) + result.push_back(pairs[i].second); + return result; +} + + +void get_input_data(const char* image_file, float* input_data, int img_h, int img_w, const float* mean, float scale) +{ + cv::Mat img = cv::imread(image_file, -1); + + if(img.empty()) + { + std::cerr << "failed to read image file " << image_file << "\n"; + return; + } + cv::resize(img, img, cv::Size(img_h, img_w)); + img.convertTo(img, CV_32FC3); + float* img_data = ( float* )img.data; + int hw = img_h * img_w; + for(int h = 0; h < img_h; h++) + for(int w = 0; w < img_w; w++) + for(int c = 0; c < 3; c++) + { + input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale; + img_data++; + } +} + +int main(int argc, char* argv[]) +{ + int res; + + while((res 
= getopt(argc, argv, "r:")) != -1) + { + switch(res) + { + case 'r': + repeat_count = strtoul(optarg, NULL, 10); + break; + + default: + break; + } + } + + int img_h = 224; + int img_w = 224; + + /* prepare input data */ + float* input_data = ( float* )malloc(sizeof(float) * img_h * img_w * 3); + + get_input_data(image_file, input_data, img_h, img_w, channel_mean, 0.017); + + + init_tengine(); + + std::cout << "run-time library version: " << get_tengine_version() << "\n"; + + if(request_tengine_version("1.0") < 0) + return -1; + + graph_t graph = create_graph(nullptr, "tengine", model_file); + + if(graph == nullptr) + { + std::cout << "Create graph0 failed\n"; + std::cout << "errno: " << get_tengine_errno() << "\n"; + return -1; + } + + /* get input tensor */ + int node_idx = 0; + int tensor_idx = 0; + + tensor_t input_tensor = get_graph_input_tensor(graph, node_idx, tensor_idx); + + if(input_tensor == nullptr) + { + std::printf("Cannot find input tensor,node_idx: %d,tensor_idx: %d\n", node_idx, tensor_idx); + return -1; + } + + int dims[] = {1, 3, img_h, img_w}; + + set_tensor_shape(input_tensor, dims, 4); + + /* setup input buffer */ + + if(set_tensor_buffer(input_tensor, input_data, 3 * img_h * img_w * 4) < 0) + { + std::printf("Set buffer for tensor failed\n"); + return -1; + } + + + /* run the graph */ + int ret_prerun = prerun_graph(graph); + if(ret_prerun < 0) + { + std::printf("prerun failed\n"); + return -1; + } + + dump_graph(graph); + + run_graph(graph, 1); + + // benchmark start here + printf("REPEAT COUNT= %d\n", repeat_count); + + unsigned long start_time = get_cur_time(); + + for(int i = 0; i < repeat_count; i++) + run_graph(graph, 1); + + unsigned long end_time = get_cur_time(); + + unsigned long off_time = end_time - start_time; + std::printf("Repeat [%d] time %.2f us per RUN. 
used %lu us\n", repeat_count, 1.0f * off_time / repeat_count, + off_time); + + /* get output tensor */ + tensor_t output_tensor = get_graph_output_tensor(graph, node_idx, tensor_idx); + + if(output_tensor == nullptr) + { + std::printf("Cannot find output tensor , node_idx: %d,tensor_idx: %d\n", node_idx, tensor_idx); + return -1; + } + + int count = get_tensor_buffer_size(output_tensor) / 4; + + float* data = ( float* )(get_tensor_buffer(output_tensor)); + float* end = data + count; + + std::vector result(data, end); + + std::vector top_N = Argmax(result, 5); + + std::vector labels; + + LoadLabelFile(labels, label_file); + + for(unsigned int i = 0; i < top_N.size(); i++) + { + int idx = top_N[i]; + + std::cout << std::fixed << std::setprecision(4) << result[idx] << " - \""; + std::cout << labels[idx] << "\"\n"; + } + + release_graph_tensor(output_tensor); + release_graph_tensor(input_tensor); + postrun_graph(graph); + destroy_graph(graph); + + free(input_data); + + std::cout << "ALL TEST DONE\n"; + + release_tengine(); + return 0; +} diff --git a/tests/bin/test_mxnet.cpp b/tests/bin/test_mxnet.cpp new file mode 100644 index 000000000..e62f85903 --- /dev/null +++ b/tests/bin/test_mxnet.cpp @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: chunyinglv@openailab.com + */ +#include +#include +#include +#include "tengine_c_api.h" +#include + +int main(int argc, char* argv[]) +{ + if(argc < 4) + { + std::cout << "[Usage]: " << argv[0] << " \n"; + return 1; + } + + // init tengine + init_tengine(); + if(request_tengine_version("0.9") < 0) + return 1; + + // create graph + printf("%s\n",argv[1]); + printf("%s\n",argv[2]); + graph_t graph = create_graph(nullptr, "mxnet", argv[1],argv[2]); + if(graph == nullptr) + { + std::cout << "Create graph failed\n"; + std::cout << "errno: " << get_tengine_errno() << "\n"; + return 1; + } + std::cout << "Create graph success\n"; + + // input + int img_h = atoi(argv[3]); + int img_w = img_h; + if(argc == 5) + img_w = atoi(argv[4]); + int img_size = img_h * img_w * 1 * 1; + + float* input_data = ( float* )malloc(sizeof(float) * img_size); + for(int i =0;i + +#include +#include +#include +#include +#include +#include + +#include "tengine_c_api.h" + +std::string model_name1 = "./models/GRU/Fused_Neural_Net-symbol.json"; +std::string model_name2 = "./models/GRU/Fused_Neural_Net-0100.params"; + +int main(int argc, char* argv[]) +{ + int steps = 1; + int res; + + while((res = getopt(argc, argv, "n:")) != -1) + { + switch(res) + { + case 'n': + steps = strtoul(optarg, NULL, 10); + break; + default: + break; + } + } + + init_tengine(); + + graph_t graph = create_graph(nullptr, "mxnet", model_name1.c_str(),model_name2.c_str()); + + // set_graph_layout(graph,TENGINE_LAYOUT_NCHW); + // dump_graph(graph); + + if(graph == nullptr) + { + std::cout << "Create graph0 failed\n"; + return 1; + } + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + + int dim[3] = {steps,28, 28}; + + set_tensor_shape(input_tensor, dim, 3); + + int input_size = get_tensor_buffer_size(input_tensor); + float* input_data = ( float* 
)malloc(input_size); + + for(unsigned int i = 0; i < input_size / sizeof(float); i++) + input_data[i] = 45; + + set_tensor_buffer(input_tensor, input_data, input_size); + + // std::cout<<"intensr "<