diff --git a/.gitignore b/.gitignore index 1fc741e69..c0ab7fc54 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ protobuf_lib/ sysroot/ android_config.txt model_src/ +.vs/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b6156999..5488ba2d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,17 +1,18 @@ if(CMAKE_TOOLCHAIN_FILE) -set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to") -# get absolute path, but get_filename_component ABSOLUTE only refer with source dir, so find_file here :( get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME) +message(STATUS,"CMAKE_TOOLCHAIN_FILE_NAME = ${CMAKE_TOOLCHAIN_FILE_NAME}, ${CMAKE_TOOLCHAIN_FILE}, ${CMAKE_SOURCE_DIR}") find_file(CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH) message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}") +else() + message(FATAL_ERROR "cmake file only used for Android build") endif() if(NOT DEFINED CMAKE_INSTALL_PREFIX) set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/install" CACHE PATH "Installation Directory") +message(STATUS,"CMAKE_CURRENT_SRC_DIR = ${CMAKE_CURRENT_SOURCE_DIR}") endif() message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}") - cmake_minimum_required(VERSION 3.6) set(CMAKE_BUILD_TYPE debug) @@ -19,15 +20,61 @@ set(CMAKE_BUILD_TYPE debug) # set(CMAKE_BUILD_TYPE release) +#get the NDK_ROOT from android.toolchains.cmake + +set(PARSED_ANDROID_NDK_REGEX "(.+)/build/cmake/android.toolchain.cmake") + +string(REGEX REPLACE "${PARSED_ANDROID_NDK_REGEX}" "\\1" PARSED_ANDROID_NDK ${CMAKE_TOOLCHAIN_FILE}) + +file(READ "${PARSED_ANDROID_NDK}/source.properties" TE_NDK_SOURCE_PROPERTIES) + +set(TE_NDK_REVISION_REGEX + "^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.([0-9]+)\\.([0-9]+)?") +if(NOT TE_NDK_SOURCE_PROPERTIES MATCHES "${TE_NDK_REVISION_REGEX}") + message(FATAL_ERROR "Failed to parse 
Android NDK revision: ${ANDROID_NDK}/source.properties.\n${TE_NDK_SOURCE_PROPERTIES}") +endif() + +set(PARSED_NDK_MAJOR "${CMAKE_MATCH_1}") +set(PARSED_NDK_MINOR "${CMAKE_MATCH_2}") + + +#ndk less than 15, CMAKE 3.6.3 works, while higher version may failed +if(PARSED_NDK_MAJOR LESS 15) +if(CMAKE_VERSION VERSION_GREATER 3.6.3) + message(FATAL_ERROR "please use cmake at most VERSION 3.6.3 for ndk " ${PARSED_NDK_MAJOR} "." ${PARSED_NDK_MINOR}) +endif() +endif() + +#real project logic starts from here + project(tengine_android) option(CONFIG_ARCH_ARM64 "build arm64 version" OFF) +option(CONFIG_ARCH_ARM32 "build arm32 version" OFF) option(CONFIG_ARCH_BLAS "build blas version" OFF) +option(CONFIG_ARCH_ARM8_2 "build float16 for arm8.2" OFF) option(CONFIG_ACL_GPU "build acl gpu version" OFF) option(CONFIG_CAFFE_SERIALIZER "caffe serializer" ON) option(CONFIG_ONNX_SERIALIZER "onnx serializer" OFF) +option(CONFIG_MXNET_SERIALIZER "mxnet serializer" OFF) option(CONFIG_TF_SERIALIZER "tensorflow serializer" OFF) +option(CONFIG_TFLITE_SERIALIZER "tflite serializer" OFF) option(CONFIG_TENGINE_SERIALIZER "tengine serializer" ON) +option(CONFIG_KERNEL_FP32 "KERNEL FP32" ON) +if(ANDROID_NDK_MAJOR AND CONFIG_ARCH_ARM8_2) + option(CONFIG_KERNEL_FP16 "KERNEL FP16" ON) +endif() +option(CONFIG_KERNEL_INT8 "KERNEL INT8" ON) +option(CONFIG_KERNEL_UINT8 "KERNEL UINT8" ON) +option(CONFIG_AUTH_DEVICE "AUTH DEVICE" ON) + + +#in face, this is related with run-time env, since API LEVEL 22 binary can run on API LEVEL 23 platform +if(ANDROID_PLATFORM_LEVEL LESS 23) + add_definitions(-DNO_CXA_DEMANGLE) +endif() + +set(CONFIG_VERSION_POSTFIX github) #message("list dir ${CMAKE_CURRENT_LIST_DIR}/.git") if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/.git) @@ -43,31 +90,64 @@ set(GIT_COMMIT_ID -DGIT_COMMIT_ID="0x${stripped_commit_id}") message("GIT COMMIT ID: " 0x${stripped_commit_id}) if (CONFIG_ARCH_ARM64) - add_definitions(-DCONFIG_ARCH_ARM64=1) + add_definitions(-DCONFIG_ARCH_ARM64=1) +endif() + 
+if(CONFIG_ARCH_ARM32) + add_definitions(-DCONFIG_ARCH_ARM32=1) endif() if(CONFIG_ARCH_BLAS) - add_definitions(-DCONFIG_ARCH_BLAS=1) + add_definitions(-DCONFIG_ARCH_BLAS=1) +endif() + +if(CONFIG_ARCH_ARM8_2) + add_definitions(-DCONFIG_ARCH_ARM8_2=1) + add_definitions(-mcpu=cortex-a55) endif() if(CONFIG_ACL_GPU) - add_definitions(-DCONFIG_ACL_GPU=1) + add_definitions(-DCONFIG_ACL_GPU=1) endif() if(CONFIG_CAFFE_SERIALIZER) - add_definitions(-DCONFIG_CAFFE_SERIALIZER=1) + add_definitions(-DCONFIG_CAFFE_SERIALIZER=1) endif() if(CONFIG_ONNX_SERIALIZER) - add_definitions(-DCONFIG_ONNX_SERIALIZER=1) + add_definitions(-DCONFIG_ONNX_SERIALIZER=1) +endif() + +if(CONFIG_MXNET_SERIALIZER) + add_definitions(-DCONFIG_MXNET_SERIALIZER=1) endif() if(CONFIG_TF_SERIALIZER) - add_definitions(-DCONFIG_TF_SERIALIZER=1) + add_definitions(-DCONFIG_TF_SERIALIZER=1) endif() if(CONFIG_TENGINE_SERIALIZER) - add_definitions(-DCONFIG_TENGINE_SERIALIZER=1) + add_definitions(-DCONFIG_TENGINE_SERIALIZER=1) +endif() + +if(CONFIG_KERNEL_FP32) + add_definitions(-DCONFIG_KERNEL_FP32=1) +endif() + +if(CONFIG_KERNEL_FP16) + add_definitions(-DCONFIG_KERNEL_FP16=1) +endif() + +if(CONFIG_KERNEL_INT8) + add_definitions(-DCONFIG_KERNEL_INT8=1) +endif() + +if(CONFIG_KERNEL_UINT8) + add_definitions(-DCONFIG_KERNEL_UINT8=1) +endif() + +if (CONFIG_VERSION_POSTFIX) + add_definitions(-DCONFIG_VERSION_POSTFIX="${CONFIG_VERSION_POSTFIX}") endif() add_definitions(${GIT_COMMIT_ID}) @@ -83,7 +163,8 @@ add_definitions(-Wno-overloaded-virtual) set(CMAKE_CXX_STANDARD 11) set(CXX_STANDARD_REQUIRED ON) -#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-overloaded-virtual") + +set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Bsymbolic -Bsymbolic-functions") include_directories(include) @@ -111,10 +192,14 @@ endif() ENDFOREACH() - +ADD_LIBRARY(hclcpu SHARED ${TOPERATOR_LIB_SRCS}) ADD_LIBRARY(tengine SHARED ${TENGINE_LIB_SRCS} ${TENGINE_SIGN_SRCS}) -ADD_DEPENDENCIES(tengine KERNEL_ASM_TARGET) +#executor 
+ADD_DEPENDENCIES(hclcpu KERNEL_ASM_TARGET) +TARGET_LINK_LIBRARIES(hclcpu tengine) + +#target_compile_definitions(operator,"--allow-shlib-undefined") if(PROTOBUF_DIR) if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) @@ -146,11 +231,11 @@ if(CONFIG_ARCH_BLAS) if( BLAS_DIR) if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) TARGET_LINK_LIBRARIES(tengine ${BLAS_DIR}/arm32/lib/libopenblas.so) - endif() + endif() if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")) - TARGET_LINK_LIBRARIES(tengine ${BLAS_DIR}/arm64/lib/libopenblas.so) - endif() - else() + TARGET_LINK_LIBRARIES(tengine ${BLAS_DIR}/arm64/lib/libopenblas.so) + endif() + else() message(FATAL_ERROR "need to set the blas path") endif() endif() @@ -172,8 +257,8 @@ if(CONFIG_ACL_GPU) endif() install (TARGETS tengine DESTINATION lib) +install (TARGETS hclcpu DESTINATION lib) install (FILES ${CMAKE_CURRENT_SOURCE_DIR}/core/include/tengine_c_api.h DESTINATION include) install (FILES ${CMAKE_CURRENT_SOURCE_DIR}/core/include/cpu_device.h DESTINATION include) -install (FILES ${CMAKE_CURRENT_SOURCE_DIR}/core/include/tengine_test_api.h DESTINATION include) install (FILES ${CMAKE_CURRENT_SOURCE_DIR}/core/include/tengine_c_compat.h DESTINATION include) diff --git a/LICENSE b/LICENSE index d64569567..d83ed7001 100644 --- a/LICENSE +++ b/LICENSE @@ -187,7 +187,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2019 OPEN AI LAB Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/Makefile b/Makefile index f00c89bb4..d67bcb6c3 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,6 @@ -### cross compile for ARM64 -#CROSS_COMPILE=aarch64-linux-gnu- -### cross compile for ARM32 -#CROSS_COMPILE=arm-linux-gnueabihf- +MAKEFILE_CONFIG=$(shell pwd)/makefile.config +include $(MAKEFILE_CONFIG) + SYSROOT:=$(shell pwd)/sysroot/ubuntu_rootfs ifeq ($(CROSS_COMPILE),aarch64-linux-gnu-) @@ -17,11 +16,20 @@ ifeq ($(CROSS_COMPILE),arm-linux-gnueabihf-) export PKG_CONFIG_PATH endif -CC=$(CROSS_COMPILE)gcc -std=gnu99 $(SYSROOT_FLAGS) -CXX=$(CROSS_COMPILE)g++ -std=c++11 $(SYSROOT_FLAGS) -LD=$(CROSS_COMPILE)g++ $(SYSROOT_FLAGS) $(SYSROOT_LDFLAGS) +ifeq ($(EMBEDDED_CROSS_ROOT),) + CC=$(CROSS_COMPILE)gcc -std=gnu99 $(SYSROOT_FLAGS) + CXX=$(CROSS_COMPILE)g++ -std=c++11 $(SYSROOT_FLAGS) + LD=$(CROSS_COMPILE)g++ $(SYSROOT_FLAGS) $(SYSROOT_LDFLAGS) +else + CC=$(CROSS_COMPILE)gcc -std=gnu99 + CXX=$(CROSS_COMPILE)g++ -std=c++11 + LD=$(CROSS_COMPILE)g++ + PKG_CONFIG_PATH:=$(EMBEDDED_CROSS_ROOT)/usr/lib/pkgconfig +endif + AR=$(CROSS_COMPILE)ar + BUILT_IN_LD=$(CROSS_COMPILE)ld GIT_COMMIT_ID=$(shell git rev-parse HEAD) @@ -31,10 +39,9 @@ COMMON_CFLAGS+=-Wno-ignored-attributes -Werror -g export CC CXX CFLAGS BUILT_IN_LD LD LDFLAGS CXXFLAGS COMMON_CFLAGS export GIT_COMMIT_ID -MAKEFILE_CONFIG=$(shell pwd)/makefile.config + MAKEBUILD=$(shell pwd)/scripts/makefile.build -include $(MAKEFILE_CONFIG) BUILD_DIR?=$(shell pwd)/build INSTALL_DIR?=$(shell pwd)/install @@ -45,9 +52,10 @@ export INSTALL_DIR MAKEBUILD TOP_DIR MAKEFILE_CONFIG LIB_SUB_DIRS=core operator executor serializer driver model_src - LIB_SO=$(BUILD_DIR)/libtengine.so LIB_A=$(BUILD_DIR)/libtengine.a +LIB_HCL_SO=$(BUILD_DIR)/libhclcpu.so +export LIB_HCL_SO LIB_OBJS=$(addprefix $(BUILD_DIR)/, $(foreach f,$(LIB_SUB_DIRS),$(f)/built-in.o)) @@ -62,8 +70,14 @@ APP_SUB_DIRS+=tests ifeq ($(CONFIG_ARCH_ARM32),y) COMMON_CFLAGS+=-march=armv7-a -mfpu=neon -mfp16-format=ieee -mfpu=neon-fp16 + export CONFIG_ARCH_ARM32 +endif 
+ +ifeq ($(CONFIG_ARCH_ARM64),y) + export CONFIG_ARCH_ARM64 endif + ifeq ($(CONFIG_FLOAT16),y) COMMON_CFLAGS+=-DCONFIG_FLOAT16 endif @@ -73,22 +87,41 @@ ifeq ($(CONFIG_LEGACY_API),y) endif +HCL_SUB_DIRS+=hclarm +LIB_HCL_OBJS=$(BUILD_DIR)/hclarm/arm-builtin.o + +ifeq ($(CONFIG_KERNEL_FP32),y) + COMMON_CFLAGS+=-DCONFIG_KERNEL_FP32 +endif + +ifeq ($(CONFIG_KERNEL_FP16),y) + COMMON_CFLAGS+=-DCONFIG_KERNEL_FP16 +endif + +ifeq ($(CONFIG_KERNEL_INT8),y) + COMMON_CFLAGS+=-DCONFIG_KERNEL_INT8 +endif + +ifeq ($(CONFIG_KERNEL_UINT8),y) + COMMON_CFLAGS+=-DCONFIG_KERNEL_UINT8 +endif + SUB_DIRS=$(LIB_SUB_DIRS) $(APP_SUB_DIRS) -default: $(LIB_SO) $(APP_SUB_DIRS) +default: $(LIB_SO) $(LIB_HCL_SO) $(APP_SUB_DIRS) build : default -clean: $(SUB_DIRS) +clean: $(SUB_DIRS) $(HCL_SUB_DIRS) -install: $(APP_SUB_DIRS) - @mkdir -p $(INSTALL_DIR)/include $(INSTALL_DIR)/lib +install: $(APP_SUB_DIRS) $(HCL_SUB_DIRS) + @mkdir -p $(INSTALL_DIR)/include $(INSTALL_DIR)/lib $(INSTALL_DIR)/tool cp -f core/include/tengine_c_api.h $(INSTALL_DIR)/include cp -f core/include/tengine_c_compat.h $(INSTALL_DIR)/include cp -f core/include/cpu_device.h $(INSTALL_DIR)/include - cp -f core/include/tengine_test_api.h $(INSTALL_DIR)/include cp -f $(BUILD_DIR)/libtengine.so $(INSTALL_DIR)/lib + cp -f $(BUILD_DIR)/tools/bin/convert_model_to_tm $(INSTALL_DIR)/tool ifeq ($(CONFIG_ACL_GPU),y) @@ -112,8 +145,15 @@ endif -$(LIB_SO): $(REAL_LIB_OBJS) - $(LD) -o $@ -shared -Wl,-Bsymbolic -Wl,-Bsymbolic-functions $(wildcard $(LIB_OBJS)) $(LIB_LDFLAGS) +$(LIB_SO): $(REAL_LIB_OBJS) $(LIB_HCL_SO) + $(LD) -o $@ -shared -Wl,-Bsymbolic -Wl,-Bsymbolic-functions $(wildcard $(LIB_OBJS)) $(LIB_LDFLAGS) $ -L$(BUILD_DIR) -Wl,-rpath,\$$ORIGIN -Wl,-rpath-link=\$$ORIGIN + +ifneq ( $(LIB_HCL_SO),) + $(LIB_HCL_SO): $(HCL_SUB_DIRS); +else + $(LIB_HCL_SO): + +endif static: static_lib static_example @@ -125,10 +165,23 @@ static_lib: static_example: static_lib $(LD) -o $(BUILD_DIR)/test_tm $(BUILD_DIR)/tests/bin/test_tm.o $(LIBS) -ltengine \ - 
-ldl -lpthread -static -L$(BUILD_DIR) -lprotobuf -lblas -lpthread + -ldl -lpthread -static -L$(BUILD_DIR) @echo ; echo static example: $(BUILD_DIR)/test_tm created -LIB_LDFLAGS+=-lpthread -lprotobuf -ldl +LIB_LDFLAGS+=-lpthread -ldl + +ifeq ($(CONFIG_CAFFE_SERIALIZER),y) + PROTOBUF_NEEDED=y +endif + +ifeq ($(CONFIG_TF_SERIALIZER),y) + PROTOBUF_NEEDED=y +endif + +ifeq ($(PROTOBUF_NEEDED),y) + PROTOBUF_LIB=$(shell export PKG_CONFIG_PATH=${PKG_CONFIG_PATH} && pkg-config --libs protobuf) + LIB_LDFLAGS+=$(PROTOBUF_LIB) +endif ifeq ($(CONFIG_ARCH_BLAS),y) LIB_LDFLAGS+=-lopenblas @@ -141,7 +194,7 @@ endif $(LIB_SUB_DIRS): @$(MAKE) -C $@ -f $(MAKEBUILD) BUILD_DIR=$(BUILD_DIR)/$@ $(MAKECMDGOALS) -$(APP_SUB_DIRS): +$(APP_SUB_DIRS) $(HCL_SUB_DIRS): @$(MAKE) -C $@ BUILD_DIR=$(BUILD_DIR)/$@ $(MAKECMDGOALS) @@ -153,4 +206,4 @@ distclean: find . -name $(BUILD_DIR) | xargs rm -rf find . -name $(INSTALL_DIR) | xargs rm -rf -.PHONY: clean install $(SUB_DIRS) build +.PHONY: clean install $(SUB_DIRS) build $(HCL_SUB_DIRS) diff --git a/README.md b/README.md index 859d7543e..0656d87d0 100644 --- a/README.md +++ b/README.md @@ -63,10 +63,45 @@ Tengine can be extended to support new serialization format, by building new ser ## Release History + +## version 1.3.2 - 2019/04/19 + +**tengine model 2.0** + +**New apis** + +get_graph_node_number() +get_graph_node_by_idx() + +**New features** + +Separate CPU operator as a independent so: hclcpu.so + +Add Reference Operator + +Update Testcase & Update permute for mxnet + +Update lstm grun mxnet serializer + +Support MXNET serializer in CMakelist.txt + +Support TFLITE serializer in CMakelist.txt + +Support eltwise in TFLITE serializer + +**More operator support** + +RNN operator definition and blas implementation + +LSTM operator definition and blas implementation + +GRU operator definition and blas implementation + ## version 1.0.0 - 2018/12/31 **tengine API 2.0** + New API set for NN inference Simplify graph create process: just create_graph() 
instead of load_model() and create_runtime_graph() diff --git a/android_build_armv7.sh b/android_build_armv7.sh index 6c8fac1c5..c28e75af1 100755 --- a/android_build_armv7.sh +++ b/android_build_armv7.sh @@ -26,7 +26,8 @@ done<../android_config.txt cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="armeabi-v7a" \ -DANDROID_ARM_NEON=ON \ - -DCONFIG_ARCH_BLAS=ON \ + -DANDROID_ALLOW_UNDEFINED_SYMBOLS=TRUE\ + -DCONFIG_ARCH_ARM32=ON \ -DANDROID_PLATFORM=android-21 \ -DANDROID_STL=c++_shared \ -DPROTOBUF_DIR=$PROTOBUF_PATH \ diff --git a/android_build_armv8.sh b/android_build_armv8.sh index 15f5d776c..0f2206938 100755 --- a/android_build_armv8.sh +++ b/android_build_armv8.sh @@ -31,4 +31,5 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DPROTOBUF_DIR=$PROTOBUF_PATH \ -DBLAS_DIR=$BLAS_PATH \ -DACL_ROOT=$ACL_ROOT \ + -DANDROID_ALLOW_UNDEFINED_SYMBOLS=TRUE\ .. diff --git a/android_pack.sh b/android_pack.sh index 862eb781c..46457c83b 100755 --- a/android_pack.sh +++ b/android_pack.sh @@ -41,5 +41,6 @@ else fi cp build/libtengine.so ./android_pack +cp build/libhclcpu.so ./android_pack diff --git a/cmake/executor.cmake b/cmake/executor.cmake old mode 100644 new mode 100755 index 64df99d37..2ab1b9b75 --- a/cmake/executor.cmake +++ b/cmake/executor.cmake @@ -1,57 +1,67 @@ include_directories(executor/include executor/operator/include) FILE(GLOB_RECURSE COMMON_LIB_CPP_SRCS executor/engine/*.cpp executor/lib/*.cpp executor/plugin/*.cpp) -FILE(GLOB COMMON_CPP_SRCS executor/operator/common/*.cpp executor/operator/common/fused/*.cpp) +FILE(GLOB COMMON_CPP_SRCS executor/operator/init.cpp executor/operator/common/*.cpp executor/operator/common/fused/*.cpp) +FILE(GLOB_RECURSE REF_CPP_SRCS executor/operator/ref/*.cpp) + if(CONFIG_ARCH_BLAS) FILE(GLOB COMMON_BLAS_SRCS executor/operator/common/blas/*.cpp) list(APPEND COMMON_CPP_SRCS ${COMMON_BLAS_SRCS}) + if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL 
"armv7-a")) + include_directories(${BLAS_DIR}/arm32/include) + endif() + if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")) + include_directories(${BLAS_DIR}/arm64/include) + endif() endif() -list(APPEND TENGINE_LIB_SRCS ${COMMON_LIB_CPP_SRCS}) -list(APPEND TENGINE_LIB_SRCS ${COMMON_CPP_SRCS}) +if(CONFIG_AUTH_DEVICE) +include_directories(hclarm/auth) +FILE(GLOB_RECURSE HCL_AUTH_SRCS hclarm/*.cpp hclarm/*.c) +list(APPEND TOPERATOR_LIB_SRCS ${HCL_AUTH_SRCS}) -include_directories(driver/cpu) +# For different settings, please change the COMPILE_FLAGS +# Please refers to hclarm/auth/auth.config +FOREACH (file ${HCL_AUTH_SRCS}) +SET_SOURCE_FILES_PROPERTIES ( ${file} PROPERTIES COMPILE_FLAGS "-DCONFIG_INTERN_TRIAL -DCONFIG_TIME_LIMIT=7200") -#add openblas include -if(CONFIG_ARCH_BLAS) - if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) - include_directories(${BLAS_DIR}/arm32/include) - endif() - if(ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")) - include_directories(${BLAS_DIR}/arm64/include) - endif() +ENDFOREACH() + endif() +list(APPEND TENGINE_LIB_SRCS ${COMMON_LIB_CPP_SRCS}) +list(APPEND TOPERATOR_LIB_SRCS ${COMMON_CPP_SRCS}) +list(APPEND TOPERATOR_LIB_SRCS ${REF_CPP_SRCS}) -# Now, handle the .S file +include_directories(driver/cpu) if(CONFIG_ARCH_ARM64) - FILE(GLOB_RECURSE ARCH64_LIB_CPP_SRCS executor/operator/arm64/*.cpp) + FILE(GLOB_RECURSE ARCH_LIB_CPP_SRCS executor/operator/arm64/*.cpp) + FILE(GLOB_RECURSE TARGET_ARCH_FILES executor/operator/arm64/*.S) include_directories(executor/operator/arm64/include) - - FOREACH(file ${ARCH64_LIB_CPP_SRCS}) - set(ACL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/executor/operator/arm64/conv/conv_2d_acl") - STRING(REGEX MATCH ${ACL_PREFIX} skip_file2 ${file}) - - if( NOT skip_file2) - list(APPEND ARCH_LIB_CPP_SRCS ${file}) - endif() - - endforeach() endif() +if(CONFIG_ARCH_ARM32) + FILE(GLOB_RECURSE ARCH_LIB_CPP_SRCS executor/operator/arm32/*.cpp) + FILE(GLOB_RECURSE TARGET_ARCH_FILES 
executor/operator/arm32/*.S) + include_directories(executor/operator/arm32/include) +endif() -list(APPEND TENGINE_LIB_SRCS ${ARCH_LIB_CPP_SRCS}) +if(CONFIG_ARCH_ARM8_2) + FILE(GLOB_RECURSE ARCH_LIB_CPP_SRCS_8_2 executor/operator/arm8_2/*.cpp) + FILE(GLOB_RECURSE TARGET_ARCH_FILES_8_2 executor/operator/arm8_2/*.S) + include_directories(executor/operator/arm8_2/include) + list(APPEND ARCH_LIB_CPP_SRCS ${ARCH_LIB_CPP_SRCS_8_2}) + list(APPEND TARGET_ARCH_FILES ${TARGET_ARCH_FILES_8_2}) +endif() -# Now, handle the .S file +FOREACH(file ${ARCH_LIB_CPP_SRCS}) + set_property(SOURCE ${file} PROPERTY COMPILE_FLAGS "-fvisibility=hidden") +ENDFOREACH() -if( CONFIG_ARCH_ARM64) +list(APPEND TOPERATOR_LIB_SRCS ${ARCH_LIB_CPP_SRCS}) -set(src_path executor/operator/arm64) -FILE(GLOB TARGET_ARCH_FILES ${src_path}/*.S ${src_path}/fc/*.S - ${src_path}/conv/*.S - ${src_path}/fused/*.S) -endif() +# Now, handle the .S file FOREACH( file ${TARGET_ARCH_FILES}) string(REPLACE "\.S" "\.s" PREPROCESS_FILE0 ${file}) @@ -68,7 +78,7 @@ ADD_CUSTOM_COMMAND( #message(${file} --> ${PREPROCESS_FILE}) -list(APPEND TENGINE_LIB_SRCS ${PREPROCESS_FILE}) +list(APPEND TOPERATOR_LIB_SRCS ${PREPROCESS_FILE}) list(APPEND ASM_FILES ${PREPROCESS_FILE}) SET_SOURCE_FILES_PROPERTIES ( ${PREPROCESS_FILE} PROPERTIES GENERATED 1) @@ -79,4 +89,3 @@ ENDFOREACH() ADD_CUSTOM_TARGET(KERNEL_ASM_TARGET DEPENDS ${ASM_FILES}) - diff --git a/cmake/serializer.cmake b/cmake/serializer.cmake index ac747df2b..e77f3a197 100644 --- a/cmake/serializer.cmake +++ b/cmake/serializer.cmake @@ -91,13 +91,28 @@ if(CONFIG_TF_SERIALIZER) endif() if(CONFIG_TENGINE_SERIALIZER) + include_directories(serializer/include/tengine) + include_directories(serializer/include/tengine/v1) + include_directories(serializer/include/tengine/v2) FILE(GLOB_RECURSE tengine_serializer_cpp_src "serializer/tengine/*.cpp") FILE(GLOB_RECURSE tengine_serializer_c_src "serializer/tengine/*.c") list(APPEND TENGINE_LIB_SRCS ${tengine_serializer_cpp_src} 
${tengine_serializer_c_src}) endif() - FILE(GLOB_RECURSE source_serializer_cpp_src "serializer/source/*.cpp") - list(APPEND TENGINE_LIB_SRCS ${source_serializer_cpp_src}) +if(CONFIG_MXNET_SERIALIZER) + FILE(GLOB_RECURSE serializer_src "serializer/mxnet/*.cpp") + list(APPEND TENGINE_LIB_SRCS ${serializer_src}) +endif() + +if(CONFIG_TFLITE_SERIALIZER) + include_directories(serializer/include/tf_lite) + FILE(GLOB_RECURSE tflite_serializer_src "serializer/tf_lite/*.cpp") + list(APPEND TENGINE_LIB_SRCS ${tflite_serializer_src}) +endif() + + +FILE(GLOB_RECURSE source_serializer_cpp_src "serializer/source/*.cpp") +list(APPEND TENGINE_LIB_SRCS ${source_serializer_cpp_src}) FILE(GLOB plugin_init "serializer/plugin/init.cpp") diff --git a/core/include/compiler_fp16.h b/core/include/compiler_fp16.h new file mode 100644 index 000000000..8fe09b739 --- /dev/null +++ b/core/include/compiler_fp16.h @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#ifndef __COMPILIER_FP16_H__ +#define __COMPILIER_FP16_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef __ARM_ARCH + +#define fp16_to_fp32(data) \ + ({ \ + float f=data; \ + f; \ + }) + +#define fp32_to_fp16(data) \ + ({ \ + __fp16 f=data; \ + f; \ + }) + + +#else + +struct fp16_pack{ + unsigned short frac:10; + unsigned char exp: 5; + unsigned char sign:1; +} __attribute__((packed)); + +struct fp32_pack{ + unsigned int frac:23; + unsigned char exp: 8; + unsigned char sign:1; +} __attribute__((packed)); + +typedef struct fp16_pack __fp16; + +static inline float fp16_to_fp32(__fp16 data) +{ + float f; + struct fp32_pack * fp32=(struct fp32_pack *)&f; + struct fp16_pack * fp16=&data; + + int exp=fp16->exp; + + if(exp==31 && fp16->frac!=0) + { + //return __builtin_inf()-__builtin_inf(); + fp32->sign=fp16->sign; + fp32->exp=255; + fp32->frac=1; + + return f; + } + + if(exp==31) + exp=255; + if(exp==0) + exp=0; + else + exp=(exp-15)+127; + + fp32->exp=exp; + fp32->sign=fp16->sign; + fp32->frac=((int)fp16->frac)<<13; + + return f; + +} + + +static inline __fp16 fp32_to_fp16(float data) +{ + struct fp32_pack * fp32=(struct fp32_pack *)&data; + struct fp16_pack fp16; + + int exp=fp32->exp; + + if(fp32->exp==255 && fp32->frac!=0) + { + //NaN + fp16.exp=31; + fp16.frac=1; + fp16.sign=fp32->sign; + + return fp16; + } + + if((exp-127)<-14) + exp = 0; + else if((exp-127)>15) + exp=31; + else + exp=exp-127+15; + + fp16.exp=exp; + fp16.frac=fp32->frac>>13; + fp16.sign=fp32->sign; + + return fp16; +} + +#endif + + +#ifdef __cplusplus +} +#endif +#endif diff --git a/core/include/cpu_device.h b/core/include/cpu_device.h index 4e279ed12..3509cc27f 100644 --- a/core/include/cpu_device.h +++ b/core/include/cpu_device.h @@ -43,7 +43,7 @@ extern "C" { #define ARCH_ARM_V7 2 #define ARCH_ARM_V8_2 3 -#define MAX_CLUSTER_CPU_NUMBER 4 +#define MAX_CLUSTER_CPU_NUMBER 8 struct cpu_cluster { diff 
--git a/core/include/data_layout.hpp b/core/include/data_layout.hpp deleted file mode 100644 index e6e753919..000000000 --- a/core/include/data_layout.hpp +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haitao@openailab.com - */ -#ifndef __DATA_LAYOUT_HPP__ -#define __DATA_LAYOUT_HPP__ - -#include "named_data.hpp" - -namespace TEngine { - -struct DataLayout : public NamedData -{ - DataLayout(const std::string& str, bool as_default = false) - { - layout_name = str; - - SetData(layout_name, this); - - if(as_default) - SetDefaultData(this); - } - - DataLayout(std::string&& str, bool as_default = false) - { - layout_name = std::move(str); - SetData(layout_name, this); - - if(as_default) - SetDefaultData(this); - } - - static const DataLayout* GetLayout(const std::string& name) - { - return GetData(name); - } - - const std::string& GetName(void) const - { - return layout_name; - } - - virtual unsigned int GetDimNum() const - { - return 0; - } - virtual int GetH() const - { - return -1; - } - virtual int GetW() const - { - return -1; - } - virtual int GetC() const - { - return -1; - } - virtual int GetD() const - { - return -1; - } - 
virtual int GetN() const - { - return -1; - } - - virtual ~DataLayout(){}; - - std::string layout_name; -}; - -struct LayoutNCHW : public DataLayout -{ - LayoutNCHW(bool as_default = false) : DataLayout("NCHW", as_default){}; - - int GetN() const - { - return 0; - } - int GetC() const - { - return 1; - } - int GetH() const - { - return 2; - } - int GetW() const - { - return 3; - } - unsigned int GetDimNum() const - { - return 4; - } -}; - -struct LayoutNCDHW : public DataLayout -{ - LayoutNCDHW(bool as_default = false) : DataLayout("NCDHW", as_default){}; - - int GetN() const - { - return 0; - } - int GetC() const - { - return 1; - } - int GetD() const - { - return 2; - } - int GetH() const - { - return 3; - } - int GetW() const - { - return 4; - } - unsigned int GetDimNum() const - { - return 5; - } -}; - -struct LayoutNHWC : public DataLayout -{ - LayoutNHWC(bool as_default = false) : DataLayout("NHWC", as_default){}; - - int GetN() const - { - return 0; - } - int GetH() const - { - return 1; - } - int GetW() const - { - return 2; - } - int GetC() const - { - return 3; - } - unsigned int GetDimNum() const - { - return 4; - } -}; - -struct LayoutNDHWC : public DataLayout -{ - LayoutNDHWC(bool as_default = false) : DataLayout("NDHWC", as_default){}; - - int GetN() const - { - return 0; - } - int GetD() const - { - return 1; - } - int GetH() const - { - return 2; - } - int GetW() const - { - return 3; - } - int GetC() const - { - return 4; - } - unsigned int GetDimNum() const - { - return 5; - } -}; - -struct LayoutNHW : public DataLayout -{ - LayoutNHW(bool as_default = false) : DataLayout("NHW", as_default){}; - - int GetN() const - { - return 0; - } - int GetH() const - { - return 1; - } - int GetW() const - { - return 2; - } - unsigned int GetDimNum() const - { - return 3; - } -}; - -struct LayoutNW : public DataLayout -{ - LayoutNW(bool as_default = false) : DataLayout("NW", as_default){}; - - int GetN() const - { - return 0; - } - int GetW() const - { - return 
1; - } - unsigned int GetDimNum() const - { - return 2; - } -}; - -struct LayoutHW : public DataLayout -{ - LayoutHW(bool as_default = false) : DataLayout("HW", as_default){}; - - int GetH() const - { - return 0; - } - int GetW() const - { - return 1; - } - unsigned int GetDimNum() const - { - return 2; - } -}; - -struct LayoutW : public DataLayout -{ - LayoutW(bool as_default = false) : DataLayout("W", as_default){}; - - int GetW() const - { - return 0; - } - unsigned int GetDimNum() const - { - return 1; - } -}; - -} // namespace TEngine - -#endif diff --git a/core/include/exec_attr.hpp b/core/include/exec_attr.hpp index 455b1f4ce..c1e2adfa6 100644 --- a/core/include/exec_attr.hpp +++ b/core/include/exec_attr.hpp @@ -46,6 +46,7 @@ enum exec_policy_t #define EXEC_KERNEL_FP32 0 #define EXEC_KERNEL_FP16 1 #define EXEC_KERNEL_INT8 2 +#define EXEC_KERNEL_UINT8 3 #define MODEL_FORMAT_UNKNOWN 0 #define MODEL_FORMAT_TENGINE 1 @@ -54,6 +55,10 @@ enum exec_policy_t #define MODEL_FORMAT_MXNET 4 #define MODEL_FORMAT_TENSORFLOW 5 #define MODEL_FORMAT_TFLITE 6 +#define MODEL_FORMAT_DLA 7 + +#define MODEL_SUBFORMAT_AIPU 1 +#define MODEL_SUBFORMAT_NNIE 2 struct ExecAttr { @@ -61,12 +66,13 @@ struct ExecAttr int priority; int kernel_mode; int model_format; + int model_layout; + int graph_layout; bool low_mem_mode; bool fc_mt; // fc should in multi-threaded? bool pooling_mt; // pooling should in multi-threaded? 
void* exec_context; void* dev_handle; - int layout; ExecAttr(void) { @@ -79,7 +85,8 @@ struct ExecAttr model_format = MODEL_FORMAT_TENGINE; exec_context = nullptr; dev_handle = nullptr; - layout = -1; + graph_layout = -1; + model_layout = -1; } }; diff --git a/core/include/graph.hpp b/core/include/graph.hpp index 979e59400..fcae590e9 100644 --- a/core/include/graph.hpp +++ b/core/include/graph.hpp @@ -50,6 +50,8 @@ class Graph : public BaseObject { name_ = name; model_format_ = -1; + model_subformat_ = -1; + model_layout_ = -1; layout_ = -1; } @@ -147,10 +149,32 @@ class Graph : public BaseObject { model_format_ = model_format; } + int GetModelFormat(void) { return model_format_; } + + void SetModelSubFormat(int model_subformat) + { + model_subformat_ = model_subformat; + } + + int GetModelSubFormat(void) + { + return model_subformat_; + } + + void SetModelLayout(int model_layout) + { + model_layout_ = model_layout; + } + + int GetModelLayout(void) + { + return model_layout_; + } + void SetLayout(int layout) { layout_ = layout; @@ -176,6 +200,8 @@ class Graph : public BaseObject std::unordered_map owned_tensors_; int model_format_; + int model_subformat_; + int model_layout_; int layout_; Attribute attrs_; diff --git a/core/include/graph_executor.hpp b/core/include/graph_executor.hpp index 4320b3fb9..21c4319f3 100644 --- a/core/include/graph_executor.hpp +++ b/core/include/graph_executor.hpp @@ -46,6 +46,7 @@ class GraphExecutor graph_attached_ = false; exec_handle_ = nullptr; prerun_done_ = false; + optimize_only=0; InitAttrIO(); } @@ -137,6 +138,9 @@ class GraphExecutor return -1; } + bool GetOptimizeOnly(const char* name, void* val, int size); + bool SetOptimizeOnly(const char* name, const void* val, int size); + bool GetExecAttrEntry(const char* name, void* val, int size); bool SetExecAttrEntry(const char* name, const void* val, int size); @@ -170,6 +174,7 @@ class GraphExecutor AttrIO attr_io_; bool prerun_done_; + int optimize_only; }; } // namespace TEngine 
diff --git a/core/include/node.hpp b/core/include/node.hpp index f9e81f2f0..e2d6da1a7 100644 --- a/core/include/node.hpp +++ b/core/include/node.hpp @@ -343,6 +343,17 @@ class Node : public BaseObject bool dynamic_shape_; }; +#define ATTR_CUSTOM_ATTR "CUSTOM_ATTR" + +struct CustomNodeAttr +{ + int attr_size; + const char* type_name; + std::vector mem; +}; + +using node_custom_attr_map_t = std::unordered_map; + } // namespace TEngine #endif diff --git a/core/include/operator.hpp b/core/include/operator.hpp index b55af6a48..f243e1179 100644 --- a/core/include/operator.hpp +++ b/core/include/operator.hpp @@ -59,11 +59,11 @@ class Operator : public BaseObject return true; } - virtual bool GetParamItem(const char* param_name, const std::type_info* type_info, void* val) + virtual bool GetParamItem(const char* param_name, const char * type_name, void* val) { return false; } - virtual bool SetParamItem(const char* param_name, const std::type_info* type_info, const void* val) + virtual bool SetParamItem(const char* param_name, const char * type_name, const void* val) { return false; } @@ -84,6 +84,15 @@ class Operator : public BaseObject return 0.0f; } + void SetOpVer(int op_ver) + { + op_ver_ = op_ver; + } + int GetOpVer(void) + { + return op_ver_; + } + void SetName(const std::string& new_name) { name_ = new_name; @@ -125,13 +134,6 @@ class Operator : public BaseObject return ParseInputOutput(std::move(output_str), outputs_); } - Operator& SetLayout(const std::string& layout_str) - { - layout_ = layout_str; - - return *this; - } - Operator& SetDoc(std::string&& doc_str) { doc_ = doc_str; @@ -174,22 +176,21 @@ class Operator : public BaseObject { return outputs_[idx].second; } - const std::string& GetLayout(void) const + + Operator() { - return layout_; + op_ver_ = 1; } - - Operator() = default; Operator(const Operator&) = default; virtual ~Operator(){}; protected: + int op_ver_; std::string name_; bool dynamic_shape_; std::vector inputs_; std::vector outputs_; - 
std::string layout_; std::string doc_; private: @@ -290,12 +291,12 @@ template class OperatorWithParam : public Operator { const std::string& str = any_cast(data); - param.SetItemVal(ir->first, &typeid(const char*), str.c_str()); + param.SetItemVal(ir->first, typeid(const char*).name(), str.c_str()); } else if(data_type == typeid(int)) { float f = ( float )any_cast(data); - param.SetItemVal(ir->first, &typeid(float), &f); + param.SetItemVal(ir->first, typeid(float).name(), &f); } ir++; @@ -323,14 +324,14 @@ template class OperatorWithParam : public Operator return param; } - bool GetParamItem(const char* param_name, const std::type_info* type_info, void* val) override + bool GetParamItem(const char* param_name, const char * type_name, void* val) override { - return param_.GetItemVal(param_name, type_info, val); + return param_.GetItemVal(param_name, type_name, val); } - bool SetParamItem(const char* param_name, const std::type_info* type_info, const void* val) override + bool SetParamItem(const char* param_name, const char * type_name, const void* val) override { - return param_.SetItemVal(param_name, type_info, val); + return param_.SetItemVal(param_name, type_name, val); } protected: diff --git a/core/include/parameter.hpp b/core/include/parameter.hpp index 7b15bed90..3aeb79626 100644 --- a/core/include/parameter.hpp +++ b/core/include/parameter.hpp @@ -40,19 +40,19 @@ struct NamedParam { item_cpy_t cpy_func; item_set_any cpy_any; - const std::type_info* type_info; + const char* type_name; int data; }; - ItemInfo* FindItem(const std::string& name, const std::type_info* item_type) + ItemInfo* FindItem(const std::string& name, const char * type_name) { if(item_map_.count(name) == 0) return nullptr; ItemInfo& entry = item_map_.at(name); - // skip type checking if type_info is nullptr - if(item_type && (*item_type != *entry.type_info)) + // skip type checking if type_name is nullptr + if(type_name && entry.type_name && strcmp(type_name,entry.type_name)) { // 
printf("requested: %s recorded:%s\n",item_type->name(),entry.type_info->name()); return nullptr; @@ -61,9 +61,9 @@ struct NamedParam return &entry; } - bool GetItemVal(const std::string& name, const std::type_info* val_type, void* val) + bool GetItemVal(const std::string& name, const char * type_name, void* val) { - ItemInfo* entry = FindItem(name, val_type); + ItemInfo* entry = FindItem(name, type_name); if(entry == nullptr) return false; @@ -73,9 +73,9 @@ struct NamedParam return true; } - bool SetItemVal(const std::string& name, const std::type_info* val_type, const void* val) + bool SetItemVal(const std::string& name, const char* type_name, const void* val) { - ItemInfo* entry = FindItem(name, val_type); + ItemInfo* entry = FindItem(name, type_name); if(entry == nullptr) return false; @@ -91,11 +91,12 @@ struct NamedParam return false; ItemInfo& entry = item_map_.at(name); - const std::type_info* item_type = entry.type_info; - const std::type_info& any_type = n.type(); + const char * item_type = entry.type_name; + const char * any_type = n.type().name(); /* several special cases */ - if(*item_type == typeid(const char*) && any_type == typeid(std::string)) + if(!strcmp(item_type,typeid(const char*).name()) && + !strcmp(any_type, typeid(std::string).name())) { const char** ptr = ( const char** )(( char* )this + entry.data); const std::string& str = any_cast(n); @@ -105,7 +106,8 @@ struct NamedParam return true; } - if(*item_type == typeid(std::string) && any_type == typeid(const char*)) + if(!strcmp(item_type,typeid(std::string).name()) && + !strcmp(any_type,typeid(const char*).name())) { std::string* p_str = ( std::string* )(( char* )this + entry.data); const char* ptr = any_cast(n); @@ -120,7 +122,7 @@ struct NamedParam bool SetItemFromAny(const std::string& name, const any& n) { - ItemInfo* entry = FindItem(name, &n.type()); + ItemInfo* entry = FindItem(name, n.type().name()); if(entry == nullptr) return SetItemCompatibleAny(name, n); @@ -145,7 +147,7 @@ 
struct NamedParam { \ typedef decltype(e) T; \ ItemInfo info; \ - info.type_info = &typeid(T); \ + info.type_name = typeid(T).name(); \ info.data = ( char* )&e - ( char* )this; \ info.cpy_func = [](void* data, const void* v) { *( T* )data = *( const T* )v; }; \ info.cpy_any = [](void* data, const any& n) { *( T* )data = any_cast(n); }; \ diff --git a/core/include/serializer.hpp b/core/include/serializer.hpp index 24a5b36ec..9bf80fe5b 100644 --- a/core/include/serializer.hpp +++ b/core/include/serializer.hpp @@ -41,6 +41,7 @@ class Serializer { public: using op_load_map_t = std::unordered_map; + using op_save_map_t = std::unordered_map; Serializer() {} virtual ~Serializer(){}; @@ -103,11 +104,34 @@ class Serializer return op_load_map_[op_name]; } + bool RegisterOpSaveMethod(const std::string& op_name, const any& save_func) + { + if(op_save_map_.count(op_name)) + return false; + + op_save_map_[op_name] = save_func; + return true; + } + + bool FindOpSaveMethod(const std::string& op_name) + { + if(op_save_map_.count(op_name)) + return true; + + return false; + } + + any& GetOpSaveMethod(const std::string& op_name) + { + return op_save_map_[op_name]; + } + protected: std::string version_; std::string name_; std::string format_name_; op_load_map_t op_load_map_; + op_save_map_t op_save_map_; }; using SerializerPtr = std::shared_ptr; diff --git a/core/include/static_graph.hpp b/core/include/static_graph.hpp index a3aaacf88..f39aea2e7 100644 --- a/core/include/static_graph.hpp +++ b/core/include/static_graph.hpp @@ -60,14 +60,20 @@ struct StaticGraph std::vector tensor_list; std::unordered_map const_tensor_map; std::vector mem_src; - int layout; + int graph_layout; + int model_layout; + int model_format; + int model_subformat; // for dla models StaticGraph(void) { exec_context = nullptr; dev_handle = nullptr; release_func = nullptr; - layout = -1; + graph_layout = -1; + model_layout = -1; + model_format = -1; + model_subformat = -1; } ~StaticGraph(void); @@ -82,6 +88,7 @@ 
struct StaticNode std::string name; int index; StaticOpPtr op; + Attribute attrs; std::vector input_tensor_list; std::vector output_tensor_list; @@ -100,10 +107,10 @@ struct StaticTensor int mem_size; std::vector dims; int data_type; - std::string data_layout; int type; float scale; int zero_point; + int width; NodeSynapse producer; std::vector consumer; virtual ~StaticTensor() {} diff --git a/core/include/static_graph_interface.hpp b/core/include/static_graph_interface.hpp index d8a65cde7..b002f0660 100644 --- a/core/include/static_graph_interface.hpp +++ b/core/include/static_graph_interface.hpp @@ -42,6 +42,9 @@ void DumpStaticGraph(StaticGraph* graph); const void* GetGraphContext(StaticGraph* graph); void SetGraphDevHandle(StaticGraph* graph, void* release_func, void* dev_handle); void SetGraphLayout(StaticGraph* graph, int layout); +void SetModelLayout(StaticGraph* graph, int layout); +void SetModelFormat(StaticGraph* graph, int model_format); +void SetModelSubFormat(StaticGraph* graph, int model_subformat); // TODO: not available to user void SetGraphInternalName(StaticGraph* graph, const std::string& name); @@ -89,7 +92,6 @@ StaticTensor* CreateStaticTensor(StaticGraph* grap, const std::string& name); void SetTensorDim(StaticTensor*, const std::vector& dims); const std::vector& GetTensorDim(StaticTensor*); void SetTensorDataType(StaticTensor*, int data_type); -void SetTensorDataLayout(StaticTensor*, const std::string& data_layout); void SetTensorType(StaticTensor*, int type); int SetTensorSize(StaticTensor*, int size); diff --git a/core/include/tengine_c_api.h b/core/include/tengine_c_api.h index 14902427b..994fd1230 100644 --- a/core/include/tengine_c_api.h +++ b/core/include/tengine_c_api.h @@ -71,6 +71,11 @@ extern "C" { #define GRAPH_PERF_STAT_RESET 4 #define GRAPH_PERF_STAT_GET 5 +/* quant mode */ +#define TENGINE_QUANT_FP16 0 +#define TENGINE_QUANT_INT8 1 +#define TENGINE_QUANT_UINT8 2 + /* follow the std. 
UNIX log level definitioin */ enum log_level { @@ -349,6 +354,19 @@ graph_t create_graph(context_t context, const char* model_format, const char* fi int save_graph(graph_t graph, const char* model_format, const char* file_name, ...); +/*! + * @brief quant the graph according to the quant mode + * + * @param [in/out] graph, the graph handle + * @param [in] quant_mode, the quant mode(fp16, int8 or uint8). see TENGINE_QUANT_FP16 etc. + * @param [in] node_no_quant_idxs, the index array of nodes not quant + * @param [in] node_no_quant_number, the number of nodes not quant + * + * @return 0 success or -1 fail + */ + +int quant_graph(graph_t graph, int quant_mode, int node_no_quant_idxs[], int node_no_quant_number); + /*! * @brief Set the layout type of the graph * the default layout of graph is NCHW @@ -577,7 +595,6 @@ int get_node_output_number(node_t node); /*! * @brief Get the input tensor number of a node. * - * @param [in] graph: The graph handle. * @param [in] node: The node hanle. * * @return >=1 the number of output tensor, @@ -586,19 +603,44 @@ int get_node_output_number(node_t node); */ int get_node_input_number(node_t node); + +/*! + * @brief Get graph node number + * + * + * @param [in] graph: the graph handle + * + * @return >=0 the number of the graph node + * -1 on error + */ + +int get_graph_node_number(graph_t graph); + +/*! + * @brief Get graph node by idx + * + * + * @param [in] graph: the graph handle + * @param [in] node_idx: the node index + * + * @return the node handle or NULL on error + */ + +node_t get_graph_node_by_idx(graph_t graph, int node_idx); + /*! * @brief Add an attribute to a node. * * @param [in] node: The target node handle. * @param [in] attr_name: The name of the attribute to be added. - * @param [in] type_info: The pointer to the std::type_info of expected type - * can be set to NULL to skip type match checking. 
+ * @param [in] type_name: The c string get by std::type_info::name() + * can be set to NULL to skip type match checking. * @param [in] size: The size of the attribute * * @return 0: Successfully, * -1: Failed. */ -int add_node_attr(node_t node, const char* attr_name, const void* type_info, int size); +int add_node_attr(node_t node, const char* attr_name, const char * type_name, int size); /*! * @brief Get the attribute value (int) of a node @@ -645,7 +687,7 @@ int get_node_attr_pointer(node_t node, const char* attr_name, void* attr_val); * * @param [in] node: The target node. * @param [in] attr_name: The name of the attribute to be retrieval. - * @param [in] type_info: The pointer to the std::type_info of expected type + * @param [in] type_name: The c string get by std::type_info::name() * can be set to NULL to skip type match checking. * @param [out] buf: The pointer to the buffer to save val. * @param [in] size: The buffer size. @@ -654,7 +696,7 @@ int get_node_attr_pointer(node_t node, const char* attr_name, void* attr_val); * -1: Failed; The name does not exist or the type mismatch. * */ -int get_node_attr_generic(node_t node, const char* attr_name, const void* type_info, void* buf, int size); +int get_node_attr_generic(node_t node, const char* attr_name, const char* type_name, void* buf, int size); /*! * @brief Set the attribute value (int) of a node @@ -703,7 +745,7 @@ int set_node_attr_pointer(node_t node, const char* attr_name, const void* attr_v * * @param [in] node: The target node. * @param [in] attr_name: The name of the attribute to be retrieval. - * @param [in] type_info: The pointer to the std::type_info of wanted type, + * @param [in] type_name: The name of std::type_info::name() * can be set to NULL to skip type match checking. * @param [in] buf: The pointer to the buffer to hold val. * @param [in] size: The buffer size. 
@@ -712,7 +754,7 @@ int set_node_attr_pointer(node_t node, const char* attr_name, const void* attr_v * -1: Failed, The name does not exist or the type mismatch. * */ -int set_node_attr_generic(node_t node, const char* attr_name, const void* type_info, const void* buf, int size); +int set_node_attr_generic(node_t node, const char* attr_name, const char * type_name, const void* buf, int size); /*! * @brief Set customer kernel of a node, on a specific device, diff --git a/core/include/tengine_c_compat.h b/core/include/tengine_c_compat.h index e75a10bb5..4488f9ca6 100644 --- a/core/include/tengine_c_compat.h +++ b/core/include/tengine_c_compat.h @@ -195,7 +195,7 @@ int get_node_param_pointer(node_t node, const char* param_name, void* param_val) * * @param node, the target node * @param param_name, the name of the param to be retrieval - * @param type_info, pointer to the std::type_info of wanted type, NULL to skip type check + * @param type_name, c string return bye the std::type_info::name() , NULL to skip type check * @param param_val, pointer to the val to be saved * @param size, parameter size * @@ -203,7 +203,7 @@ int get_node_param_pointer(node_t node, const char* param_name, void* param_val) * <0, failed; probably the name does not exist or the type mismatch */ -int get_node_param_generic(node_t node, const char* param_name, const void* type_info, void* param_val, int size); +int get_node_param_generic(node_t node, const char* param_name, const char* type_name, void* param_val, int size); /*! * @brief infer shape for graph @@ -216,11 +216,10 @@ int infer_shape(graph_t graph); * @brief Get the layout of tensor. * * @param [in] tensor: The tensor handle. - * @param [out] layout: The layout of tensor. * @return >=1 the valid dim number, or -1 Fail. * */ -int get_tensor_layout(tensor_t tensor, char* layout); +int get_tensor_layout(tensor_t tensor); /*! * @brief Set the layout of tensor. 
@@ -230,7 +229,7 @@ int get_tensor_layout(tensor_t tensor, char* layout); * @return 0: Success; -1: Fail. * */ -int set_tensor_layout(tensor_t tensor, const char* layout); +int set_tensor_layout(tensor_t tensor, const int layout); #ifdef __cplusplus } diff --git a/core/include/tengine_c_helper.hpp b/core/include/tengine_c_helper.hpp index 1b079129f..4f8d45048 100644 --- a/core/include/tengine_c_helper.hpp +++ b/core/include/tengine_c_helper.hpp @@ -29,10 +29,10 @@ extern "C" { -int node_add_attr(node_t node, const char* attr_name, const void* type_info, int size); +int node_add_attr(node_t node, const char* attr_name, const char* type_name, int size); -int node_get_attr_generic(void* node, const char* param_name, const void* type_info, void* param_val, int param_size); -int node_set_attr_generic(void* node, const char* param_name, const void* type_info, const void* param_val, +int node_get_attr_generic(void* node, const char* param_name, const char* type_name, void* param_val, int param_size); +int node_set_attr_generic(void* node, const char* param_name, const char* type_name, const void* param_val, int param_size); void set_cpu_list(const char* cpu_list_str); @@ -44,6 +44,8 @@ graph_t create_graph_in_context(context_t exec_context, const char* graph_name, int save_graph_internal(graph_t graph, const char* file_format, const char* fname, va_list argp); +int quant_graph_internal(graph_t graph, int quant_mode, int node_no_quant_idxs[], int node_no_quant_number); + const char* get_model_name(graph_t graph); } @@ -51,7 +53,7 @@ namespace TEngine { class GraphExecutor; -void InitAllPlugin(void); +int InitAllPlugin(void); GraphExecutor* do_merge_graph(std::vector& exec_list); diff --git a/core/include/tengine_test_api.h b/core/include/tengine_test_api.h deleted file mode 100644 index 4951b8a16..000000000 --- a/core/include/tengine_test_api.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license 
agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ -#ifndef __TENGINE_TEST_API_H__ -#define __TENGINE_TEST_API_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void* test_node_t; - -test_node_t create_convolution_test_node(int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h0, int pad_h1, - int pad_w0, int pad_w1, int dilation_h, int dilation_w, int input_channel, - int output_channel, int group); - -test_node_t create_fc_test_node(int hidden_number, int output_number); - -test_node_t create_pooling_test_node(int pool_method, int kernel_h, int kernel_w, int stride_h, int stride_w, - int pad_h0, int pad_h1, int pad_w0, int pad_w1, int global); - -int test_node_set_input(test_node_t node, float* input_data[], int* input_shape[], int input_number); -int test_node_set_output(test_node_t node, float* output_data[], int* output_shape[], int output_number); - -int test_node_prerun(test_node_t node); - -int test_node_run(test_node_t node); - -int test_node_postrun(test_node_t node); - -void destroy_test_node(test_node_t node); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/core/include/tensor_shape.hpp b/core/include/tensor_shape.hpp index abdd042dd..b2a53abd5 100644 --- a/core/include/tensor_shape.hpp +++ 
b/core/include/tensor_shape.hpp @@ -44,12 +44,12 @@ enum TensorType class TShape { public: - void SetDataLayout(const std::string& layout_name) + void SetDataLayout(int layout) { - layout_ = layout_name; + layout_ = layout; } - const std::string& GetDataLayout(void) const + int GetDataLayout(void) const { return layout_; } @@ -95,7 +95,7 @@ class TShape return true; } - void SetDim(const std::vector& args, bool layout_check = false); + void SetDim(const std::vector& args); void DumpShape(std::ostream& os) const; @@ -103,7 +103,6 @@ class TShape int GetC(void) const; int GetH(void) const; int GetW(void) const; - int GetD(void) const; TShape() = default; @@ -148,7 +147,7 @@ class TShape private: std::vector dim_; - std::string layout_; + int layout_; }; } // namespace TEngine diff --git a/core/include/worker_thread.hpp b/core/include/worker_thread.hpp index edac8c1ea..a3611e81d 100644 --- a/core/include/worker_thread.hpp +++ b/core/include/worker_thread.hpp @@ -26,6 +26,9 @@ #ifndef __WORKER_THREAD_HPP__ #define __WORKER_THREAD_HPP__ +#include +#include + #include #include #include @@ -107,6 +110,19 @@ template class WorkerThread private: void DoWork(void) { + int task_done_count = 0; + bool skip = false; + +#ifdef CONFIG_MAX_RUN_TIME + long start_time; + + struct timeval tv; + + gettimeofday(&tv, NULL); + + start_time = tv.tv_sec; +#endif + // bind CPU first if(bind_cpu_ >= 0) { @@ -126,10 +142,30 @@ template class WorkerThread if(quit_work_) break; - process_(task, bind_cpu_); +#ifdef CONFIG_MAX_RUN_COUNT + if(task_done_count > CONFIG_MAX_RUN_COUNT) + skip = true; +#endif + +#ifdef CONFIG_MAX_RUN_TIME + if(!(task_done_count & 0x3fff)) + { + struct timeval tv; + + gettimeofday(&tv, NULL); + + if((tv.tv_sec - start_time) >= CONFIG_MAX_RUN_TIME) + skip = true; + } + +#endif + if(!skip) + process_(task, bind_cpu_); if(inc_done_) inc_done_(1); + + task_done_count++; } } diff --git a/core/lib/Makefile b/core/lib/Makefile index c5b44fa65..d46063ee7 100644 --- 
a/core/lib/Makefile +++ b/core/lib/Makefile @@ -1,5 +1,4 @@ obj-y+=data_type.o -obj-y+=data_layout.o obj-y+=exec_context.o obj-y+=tensor.o obj-y+=tensor_shape.o @@ -19,6 +18,7 @@ obj-y+=compiler.o obj-y+=tengine_c_helper.o obj-y+=tengine_version.o obj-y+=tengine_errno.o +obj-y+=tengine_runtime_error.o obj-y+=logger/ obj-$(CONFIG_LEGACY_API)+=tengine_c_compat.o diff --git a/core/lib/graph.cpp b/core/lib/graph.cpp index 1e1285c49..3e8afc0bb 100644 --- a/core/lib/graph.cpp +++ b/core/lib/graph.cpp @@ -142,6 +142,14 @@ bool Graph::CreateNodeFromStatic(Node* node, const StaticGraph* static_graph, co op->SetDynamicShape(static_op->dynamic_shape); node->SetDynamicShape(static_op->dynamic_shape); + /* copy attrs in static_node */ + std::vector node_attr_name = static_node->attrs.ListAttr(); + + for(unsigned int i = 0; i < node_attr_name.size(); i++) + { + node->SetAttr(node_attr_name[i], static_node->attrs.GetAttr(node_attr_name[i])); + } + /* copy attrs in static_op */ std::vector attr_name = static_op->attrs.ListAttr(); @@ -166,7 +174,7 @@ bool Graph::CreateNodeFromStatic(Node* node, const StaticGraph* static_graph, co TShape& shape = tensor->GetShape(); - shape.SetDataLayout(static_tensor->data_layout); + shape.SetDataLayout(static_graph->graph_layout); shape.SetDim(static_tensor->dims); std::vector* quant_param = tensor->GetQuantParam(); @@ -174,6 +182,7 @@ bool Graph::CreateNodeFromStatic(Node* node, const StaticGraph* static_graph, co (*quant_param)[0].scale = static_tensor->scale; (*quant_param)[0].zero_point = static_tensor->zero_point; + (*quant_param)[0].width = static_tensor->width; if(static_tensor->type == kConstTensor) { @@ -227,6 +236,7 @@ bool Graph::SetupConnection(Tensor* tensor, const StaticGraph* static_graph, con return true; } +#if 0 static int model_format_mapping(const std::string& fmt) { if(fmt == "tengine") @@ -258,6 +268,7 @@ static int model_format_mapping(const std::string& fmt) return MODEL_FORMAT_UNKNOWN; } } +#endif bool 
Graph::RealCreateFromStatic(const StaticGraphPtr& static_graph) { @@ -324,7 +335,11 @@ bool Graph::RealCreateFromStatic(const StaticGraphPtr& static_graph) /* save the model format */ - model_format_ = model_format_mapping(static_graph->source_format); + //model_format_ = model_format_mapping(static_graph->source_format); + model_format_=static_graph->model_format; + model_subformat_=static_graph->model_subformat; + model_layout_=static_graph->model_layout; + layout_=static_graph->graph_layout; return true; } diff --git a/core/lib/graph_executor.cpp b/core/lib/graph_executor.cpp index d806695e0..9d1a89687 100644 --- a/core/lib/graph_executor.cpp +++ b/core/lib/graph_executor.cpp @@ -44,6 +44,8 @@ bool GraphExecutor::CreateGraph(void* exec_context, const char* graph_name, cons { graph = new Graph(graph_name); graph->SetModelFormat(MODEL_FORMAT_TENGINE); + graph->SetLayout(TENGINE_LAYOUT_NCHW); + graph->SetModelLayout(TENGINE_LAYOUT_NCHW); } else { @@ -105,6 +107,7 @@ bool GraphExecutor::PrepareExec(void* exec_context, Graph* graph, StaticGraph* s bool GraphExecutor::SetExecParam(Graph* graph) { +#if 0 int model_format = graph->GetModelFormat(); /* set proper layout */ @@ -112,11 +115,17 @@ bool GraphExecutor::SetExecParam(Graph* graph) model_format == MODEL_FORMAT_TENSORFLOW || model_format == MODEL_FORMAT_MXNET || model_format == MODEL_FORMAT_TENGINE) { - exec_attr_.layout = TENGINE_LAYOUT_NCHW; + exec_attr_.graph_layout = TENGINE_LAYOUT_NCHW; + + if(model_format == MODEL_FORMAT_TENSORFLOW) + exec_attr_.model_layout = TENGINE_LAYOUT_NHWC; + else + exec_attr_.model_layout = TENGINE_LAYOUT_NCHW; } else if(model_format == MODEL_FORMAT_TFLITE) { - exec_attr_.layout = TENGINE_LAYOUT_NHWC; + exec_attr_.graph_layout = TENGINE_LAYOUT_NHWC; + exec_attr_.model_layout = TENGINE_LAYOUT_NHWC; } else { @@ -125,9 +134,28 @@ bool GraphExecutor::SetExecParam(Graph* graph) } exec_attr_.model_format = model_format; +#else - if(graph->GetLayout() >= 0) - exec_attr_.layout = 
graph->GetLayout(); +#endif + + exec_attr_.graph_layout = graph->GetLayout(); + exec_attr_.model_layout = graph->GetModelLayout(); + exec_attr_.model_format = graph->GetModelFormat(); + + if(exec_attr_.graph_layout<0) + { + LOG_ERROR()<<"why graph layout is: "<output_nodes.size(); + Graph* cur_graph = GetOptimizedGraph(); - return graph_->output_nodes.size(); + return cur_graph->output_nodes.size(); } const std::string& GraphExecutor::GetGraphOutputNodeName(int idx) { - Graph* optimized_graph = GetOptimizedGraph(); - - Graph* cur_graph; - - if(optimized_graph) - cur_graph = optimized_graph; - else - cur_graph = graph_; + Graph* cur_graph = GetOptimizedGraph(); std::vector& outputs = cur_graph->output_nodes; Node* node = outputs[idx]; @@ -286,35 +304,26 @@ bool GraphExecutor::SetGraphOutputNode(const std::vector& node_name Node* GraphExecutor::FindNode(const std::string& name) { - Graph* optimized_graph = GetOptimizedGraph(); - - if(optimized_graph) - { - Node* node = optimized_graph->FindNode(name); - if(node) - return node; - } + Graph* cur_graph = GetOptimizedGraph(); - return graph_->FindNode(name); + Node* node = cur_graph->FindNode(name); + if(node) + return node; + else + return graph_->FindNode(name); } Tensor* GraphExecutor::FindTensor(const std::string& name) { // try to search in optmized graph first - Graph* optimized_graph = GetOptimizedGraph(); - - if(optimized_graph) - { - Tensor* tensor; - - tensor = optimized_graph->FindTensor(name); - - if(tensor) - return tensor; - } + Graph* cur_graph = GetOptimizedGraph(); - return graph_->FindTensor(name); + Tensor* tensor = cur_graph->FindTensor(name); + if(tensor) + return tensor; + else + return graph_->FindTensor(name); } bool GraphExecutor::InferShape(void) @@ -381,7 +390,7 @@ bool GraphExecutor::InferShape(void) outputs.resize(node->GetOutputNum()); - if(!op->InferShape(inputs, outputs, exec_attr_.layout)) + if(!op->InferShape(inputs, outputs, exec_attr_.graph_layout)) { std::cout << "infer shaped for 
node: " << node->GetName() << " op: " << op->GetName() << " failed\n"; return false; @@ -521,6 +530,18 @@ bool GraphExecutor::Prerun(void) SetExecParam(graph_); + int optimize_only=0; + + GetGraphAttr("optimize_only",&optimize_only,sizeof(int)); + + if(optimize_only) + { + if(exec_engine_->Prerun(exec_handle_)) + return true; + else + return false; + } + if(InferShape() && exec_engine_->Prerun(exec_handle_)) { prerun_done_ = true; @@ -578,6 +599,27 @@ Graph* GraphExecutor::GetOptimizedGraph(void) return graph; } +bool GraphExecutor::GetOptimizeOnly(const char* name, void* val, int size) +{ + if(size!=sizeof(int)) + return false; + + *(int *)val=optimize_only; + + return 0; + +} + +bool GraphExecutor::SetOptimizeOnly(const char* name, const void* val, int size) +{ + const int * int_ptr=(const int *)val; + + optimize_only=int_ptr[0]; + + return true; +} + + bool GraphExecutor::SetExecAttrEntry(const char* name, const void* val, int size) { if(!strcmp("exec_policy", name)) @@ -693,6 +735,17 @@ void GraphExecutor::InitAttrIO(void) attr_io_.RegSetFunc("fc_mt", set_func); attr_io_.RegSetFunc("pooling_mt", set_func); + + auto set_opt_only_func=std::bind(&GraphExecutor::SetOptimizeOnly,this,std::placeholders::_1,std::placeholders::_2, + std::placeholders::_3); + + auto get_opt_only_func=std::bind(&GraphExecutor::GetOptimizeOnly,this,std::placeholders::_1,std::placeholders::_2, + std::placeholders::_3); + + attr_io_.RegSetFunc("optimize_only",set_opt_only_func); + attr_io_.RegGetFunc("optimize_only",get_opt_only_func); + + // bailout auto set_func2 = std::bind(&GraphExecutor::BailoutSetAttr, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); diff --git a/core/lib/logger/logger.cpp b/core/lib/logger/logger.cpp index af75742b2..c5592e8d3 100644 --- a/core/lib/logger/logger.cpp +++ b/core/lib/logger/logger.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include "compiler.hpp" #include "logger.hpp" @@ -217,8 +218,13 @@ log_stream_t 
StdLogger::Log(LogLevel level) if(option_.log_date) { auto t = system_clock::to_time_t(system_clock::now()); - +#if defined(__GNUC__) && __GNUC__ > 5 (*log_stream) << std::put_time(std::localtime(&t), "%Y-%m-%d %X "); +#else + char buf[128]; + strftime(buf,128,"%Y-%m-%d %X ",localtime(&t)); + (*log_stream)< mem; -}; - -using node_custom_attr_map_t = std::unordered_map; - -int NodeAddParamGeneric(void* node, const char* param_name, const void* type_info, int param_size) +int NodeAddParamGeneric(void* node, const char* param_name, const char * type_name, int param_size) { Node* real_node = ( Node* )node; @@ -159,7 +148,7 @@ int NodeAddParamGeneric(void* node, const char* param_name, const void* type_inf CustomNodeAttr attr_entry; - attr_entry.type_info = type_info; + attr_entry.type_name = type_name; attr_entry.attr_size = param_size; (*attr_map)[param_name] = attr_entry; @@ -167,13 +156,13 @@ int NodeAddParamGeneric(void* node, const char* param_name, const void* type_inf return 0; } -int NodeGetParamGeneric(void* node, const char* param_name, const void* type_info, void* param_val, int size) +int NodeGetParamGeneric(void* node, const char* param_name, const char * type_name, void* param_val, int size) { Node* real_node = ( Node* )node; Operator* op = real_node->GetOp(); - if(op->GetParamItem(param_name, ( const std::type_info* )type_info, param_val)) + if(op->GetParamItem(param_name, type_name, param_val)) return 0; /* check custom attr */ @@ -193,7 +182,7 @@ int NodeGetParamGeneric(void* node, const char* param_name, const void* type_inf CustomNodeAttr* attr_entry = &attr_map->at(param_name); - if((size != attr_entry->attr_size) || (type_info && attr_entry->type_info && type_info != attr_entry->type_info)) + if((size != attr_entry->attr_size) || (type_name && attr_entry->type_name && strcmp(type_name,attr_entry->type_name))) { set_tengine_errno(EINVAL); return -1; @@ -206,13 +195,13 @@ int NodeGetParamGeneric(void* node, const char* param_name, const void* 
type_inf return 0; } -int NodeSetParamGeneric(void* node, const char* param_name, const void* type_info, const void* param_val, int size) +int NodeSetParamGeneric(void* node, const char* param_name, const char* type_name, const void* param_val, int size) { Node* real_node = ( Node* )node; Operator* op = real_node->GetOp(); - if(op->SetParamItem(param_name, ( const std::type_info* )type_info, param_val)) + if(op->SetParamItem(param_name, type_name, param_val)) return 0; /* check custom attr */ @@ -232,7 +221,7 @@ int NodeSetParamGeneric(void* node, const char* param_name, const void* type_inf CustomNodeAttr* attr_entry = &attr_map->at(param_name); - if((size != attr_entry->attr_size) || (type_info && attr_entry->type_info && type_info != attr_entry->type_info)) + if((size != attr_entry->attr_size) || (type_name && attr_entry->type_name && strcmp(type_name,attr_entry->type_name))) { set_tengine_errno(EINVAL); return -1; diff --git a/core/lib/serializer.cpp b/core/lib/serializer.cpp index 5eaad045f..8dde693c5 100644 --- a/core/lib/serializer.cpp +++ b/core/lib/serializer.cpp @@ -60,4 +60,21 @@ any& GetOpLoadMethod(const std::string& op_name, const std::string& method_name) return op_method_load_map[key]; } +bool FindOpSaveMethod(const std::string& op_name, const std::string& method_name) +{ + std::string key = op_name + method_name; + + if(op_method_save_map.ExistAttr(key)) + return true; + + return false; +} + +any& GetOpSaveMethod(const std::string& op_name, const std::string& method_name) +{ + std::string key = op_name + method_name; + + return op_method_save_map[key]; +} + } // namespace TEngine diff --git a/core/lib/static_graph.cpp b/core/lib/static_graph.cpp index e9b6bab2a..3de56f78b 100644 --- a/core/lib/static_graph.cpp +++ b/core/lib/static_graph.cpp @@ -69,7 +69,22 @@ void SetGraphDevHandle(StaticGraph* graph, void* release_func, void* dev_handle) void SetGraphLayout(StaticGraph* graph, int layout) { - graph->layout = layout; + graph->graph_layout = 
layout; +} + +void SetModelLayout(StaticGraph* graph, int layout) +{ + graph->model_layout = layout; +} + +void SetModelFormat(StaticGraph* graph, int model_format) +{ + graph->model_format = model_format; +} + +void SetModelSubFormat(StaticGraph* graph, int model_subformat) +{ + graph->model_subformat = model_subformat; } void SetGraphInternalName(StaticGraph* graph, const std::string& name) @@ -382,7 +397,6 @@ StaticTensor* CreateStaticTensor(StaticGraph* graph, const std::string& name) tensor_ptr->index = tensor_idx; tensor_ptr->name = name; tensor_ptr->type = kVarTensor; - graph->tensor_list.push_back(tensor_ptr); return tensor_ptr.get(); @@ -403,11 +417,6 @@ void SetTensorDataType(StaticTensor* tensor, int data_type) tensor->data_type = data_type; } -void SetTensorDataLayout(StaticTensor* tensor, const std::string& data_layout) -{ - tensor->data_layout = data_layout; -} - void SetTensorType(StaticTensor* tensor, int type) { tensor->type = type; @@ -499,7 +508,6 @@ void DumpStaticNode(StaticGraph* graph, StaticNode* node, std::ostream& os) StaticTensorPtr tensor_ptr = graph->tensor_list[index]; os << "\tI" << i << ": " << tensor_ptr->name << " type: " << tensor_ptr->type; - os << " datalayout: " << tensor_ptr->data_layout << " "; os << " data_type: " << tensor_ptr->data_type << " "; if(tensor_ptr->dims.size()) @@ -521,7 +529,6 @@ void DumpStaticNode(StaticGraph* graph, StaticNode* node, std::ostream& os) StaticTensorPtr tensor_ptr = graph->tensor_list[index]; os << "\tO" << i << ": " << tensor_ptr->name << " type: " << tensor_ptr->type; - os << " datalayout: " << tensor_ptr->data_layout << " "; os << " data_type: " << tensor_ptr->data_type << " "; if(tensor_ptr->dims.size()) diff --git a/core/lib/tengine_c_api.cpp b/core/lib/tengine_c_api.cpp index 79d767ce6..f484f5c1b 100644 --- a/core/lib/tengine_c_api.cpp +++ b/core/lib/tengine_c_api.cpp @@ -88,7 +88,11 @@ int init_tengine(void) set_cpu_list(cpu_list_str); } - InitAllPlugin(); + if(InitAllPlugin()<0) + { + 
return -1; + } + if(TEnginePlugin::InitModule() < 0) { @@ -104,9 +108,37 @@ int init_tengine(void) return 0; } +void dump_mem_prof(void) +{ + int pid=getpid(); + + char fname[128]; + + LOG_INFO()<<"\ntengine memory profile result:\n"; + + sprintf(fname,"/proc/%d/status",pid); + + FILE * fp=fopen(fname,"r"); + + char line[128]; + + while(fgets(line,128,fp)) + { + if(line[0]=='V' && line[1]=='m') + LOG_INFO()<(graph); + Graph* g = executor->GetOptimizedGraph(); + + if(g->GetModelFormat() == MODEL_FORMAT_TFLITE) + { + LOG_INFO() << "Not quant tf-lite model.\n"; + return 0; + } + + if(quant_mode != TENGINE_QUANT_FP16 && quant_mode != TENGINE_QUANT_INT8) + { + LOG_ERROR() << "Currently only support fp16 and int8 quant.\n"; + set_tengine_errno(EINVAL); + return -1; + } + + return quant_graph_internal(graph, quant_mode, node_no_quant_idxs, node_no_quant_number); +} + int set_graph_layout(graph_t graph, int layout_type) { if(layout_type != TENGINE_LAYOUT_NCHW && layout_type != TENGINE_LAYOUT_NHWC) @@ -691,12 +745,40 @@ int get_node_input_number(node_t node) return real_node->GetInputNum(); } -int add_node_attr(node_t node, const char* attr_name, const void* type_info, int size) +int get_graph_node_number(graph_t graph) +{ + GraphExecutor* executor = reinterpret_cast(graph); + Graph* real_graph = executor->GetOptimizedGraph(); + + return real_graph->seq_nodes.size(); +} + +node_t get_graph_node_by_idx(graph_t graph, int node_idx) +{ + GraphExecutor* executor = reinterpret_cast(graph); + Graph* real_graph = executor->GetOptimizedGraph(); + + int node_num=real_graph->seq_nodes.size(); + + if(node_idx<0 || node_idx>=node_num) + { + set_tengine_errno(EINVAL); + return nullptr; + } + + Node* node = real_graph->seq_nodes[node_idx]; + + node->SetAttr(ATTR_API_GRAPH, executor); + + return node; +} + +int add_node_attr(node_t node, const char* attr_name, const char* type_name, int size) { /* first check if the attribute exists*/ void* buf = malloc(size); - int ret = 
get_node_attr_generic(node, attr_name, type_info, buf, size); + int ret = get_node_attr_generic(node, attr_name, type_name, buf, size); free(buf); @@ -706,17 +788,17 @@ int add_node_attr(node_t node, const char* attr_name, const void* type_info, int return -1; } - return node_add_attr(node, attr_name, type_info, size); + return node_add_attr(node, attr_name, type_name, size); } int get_node_attr_int(node_t node, const char* attr_name, int* attr_val) { - return get_node_attr_generic(node, attr_name, &typeid(int), attr_val, sizeof(int)); + return get_node_attr_generic(node, attr_name, typeid(int).name(), attr_val, sizeof(int)); } int get_node_attr_float(node_t node, const char* attr_name, float* attr_val) { - return get_node_attr_generic(node, attr_name, &typeid(float), attr_val, sizeof(float)); + return get_node_attr_generic(node, attr_name, typeid(float).name(), attr_val, sizeof(float)); } int get_node_attr_pointer(node_t node, const char* attr_name, void* attr_val) @@ -724,19 +806,19 @@ int get_node_attr_pointer(node_t node, const char* attr_name, void* attr_val) return get_node_attr_generic(node, attr_name, nullptr, attr_val, sizeof(void*)); } -int get_node_attr_generic(node_t node, const char* attr_name, const void* type_info, void* buf, int size) +int get_node_attr_generic(node_t node, const char* attr_name, const char * type_name, void* buf, int size) { - return node_get_attr_generic(node, attr_name, type_info, buf, size); + return node_get_attr_generic(node, attr_name, type_name, buf, size); } int set_node_attr_int(node_t node, const char* attr_name, const int* attr_val) { - return set_node_attr_generic(node, attr_name, &typeid(int), attr_val, sizeof(int)); + return set_node_attr_generic(node, attr_name, typeid(int).name(), attr_val, sizeof(int)); } int set_node_attr_float(node_t node, const char* attr_name, const float* attr_val) { - return set_node_attr_generic(node, attr_name, &typeid(float), attr_val, sizeof(float)); + return set_node_attr_generic(node, 
attr_name, typeid(float).name(), attr_val, sizeof(float)); } int set_node_attr_pointer(node_t node, const char* attr_name, const void* attr_val) @@ -744,9 +826,9 @@ int set_node_attr_pointer(node_t node, const char* attr_name, const void* attr_v return set_node_attr_generic(node, attr_name, nullptr, attr_val, sizeof(void*)); } -int set_node_attr_generic(node_t node, const char* attr_name, const void* type_info, const void* buf, int size) +int set_node_attr_generic(node_t node, const char* attr_name, const char* type_name, const void* buf, int size) { - return node_set_attr_generic(node, attr_name, type_info, buf, size); + return node_set_attr_generic(node, attr_name, type_name, buf, size); } tensor_t create_graph_tensor(graph_t graph, const char* tensor_name, int data_type) @@ -759,6 +841,13 @@ tensor_t create_graph_tensor(graph_t graph, const char* tensor_name, int data_ty return nullptr; } + if(data_type TENGINE_DT_INT16) + { + LOG_ERROR()<<"unknown data type: "<GetGraph(); if(real_graph->FindTensor(tensor_name)) @@ -777,6 +866,7 @@ tensor_t create_graph_tensor(graph_t graph, const char* tensor_name, int data_ty new_tensor->SetDataType(data_type); new_tensor->SetType(TENSOR_TYPE_CONST); + new_tensor->GetShape().SetDataLayout(real_graph->GetLayout()); real_graph->AddTensor(new_tensor); @@ -1469,13 +1559,5 @@ void dump_graph(graph_t graph) /* first: try to dump optimized graph */ Graph* g = executor->GetOptimizedGraph(); - if(g) - { - g->DumpGraph(); - return; - } - - /* get the origin graph */ - g = executor->GetGraph(); g->DumpGraph(); } diff --git a/core/lib/tengine_c_compat.cpp b/core/lib/tengine_c_compat.cpp index 2798190b6..ef497c2af 100644 --- a/core/lib/tengine_c_compat.cpp +++ b/core/lib/tengine_c_compat.cpp @@ -28,7 +28,6 @@ #include "tengine_c_helper.hpp" #include "exec_context.hpp" -#include "data_layout.hpp" #include "graph_executor.hpp" using namespace TEngine; @@ -114,9 +113,9 @@ int get_node_param_pointer(node_t node, const char* param_name, void* 
param_val) return get_node_attr_pointer(node, param_name, param_val); } -int get_node_param_generic(node_t node, const char* param_name, const void* type_info, void* param_val, int size) +int get_node_param_generic(node_t node, const char* param_name, const char * type_name, void* param_val, int size) { - return get_node_attr_generic(node, param_name, type_info, param_val, size); + return get_node_attr_generic(node, param_name, type_name, param_val, size); } int infer_shape(graph_t graph) @@ -128,32 +127,22 @@ int infer_shape(graph_t graph) return 0; } -int set_tensor_layout(tensor_t tensor, const char* layout) +int set_tensor_layout(tensor_t tensor, int layout) { - std::string real_layout = layout; - const DataLayout* data_layout = DataLayout::GetLayout(real_layout); - if(data_layout == nullptr) - return -1; Tensor* real_tensor = reinterpret_cast(tensor); TShape shape = real_tensor->GetShape(); - shape.SetDataLayout(real_layout); + shape.SetDataLayout(layout); real_tensor->Reshape(shape); return 0; } -int get_tensor_layout(tensor_t tensor, char* layout) +int get_tensor_layout(tensor_t tensor) { Tensor* real_tensor = reinterpret_cast(tensor); TShape shape = real_tensor->GetShape(); - const std::string& data_layout = shape.GetDataLayout(); - if(data_layout.empty()) - return -1; - int len = strlen(data_layout.c_str()); - memcpy(layout, data_layout.c_str(), len); - - return 0; + return shape.GetDataLayout(); } diff --git a/core/lib/tengine_c_helper.cpp b/core/lib/tengine_c_helper.cpp index 5b64dbe70..6400ae9f3 100644 --- a/core/lib/tengine_c_helper.cpp +++ b/core/lib/tengine_c_helper.cpp @@ -24,37 +24,33 @@ #include #include #include - +#include #include #include +#include "share_lib_parser.hpp" #include "cpu_device.h" #include "tengine_c_api.h" #include "tengine_c_compat.h" #include "tengine_c_helper.hpp" -#include "data_layout.hpp" #include "exec_context.hpp" #include "graph_executor.hpp" #include "tengine_errno.hpp" #include "static_graph_interface.hpp" #include 
"serializer.hpp" +#include "compiler_fp16.h" namespace TEngine { -extern int NodeSetParamGeneric(void* node, const char* param_name, const void* type_info, const void* param_val, +extern int NodeSetParamGeneric(void* node, const char* param_name, const char* type_name, const void* param_val, int size); -extern int NodeGetParamGeneric(void* node, const char* param_name, const void* type_info, void* param_val, int size); -extern int NodeAddParamGeneric(void* node, const char* param_name, const void* type_info, int size); +extern int NodeGetParamGeneric(void* node, const char* param_name, const char* type_name, void* param_val, int size); +extern int NodeAddParamGeneric(void* node, const char* param_name, const char* type_name, int size); } // namespace TEngine using namespace TEngine; -void __attribute__((constructor)) first_init(void) -{ - NamedData::InitPredefinedData(); -} - void set_cpu_list(const char* cpu_list_str) { char* copy_str = strdup(cpu_list_str); @@ -120,20 +116,20 @@ int dump_model(const char* model_name) return -1; } -int node_get_attr_generic(void* node, const char* param_name, const void* type_info, void* param_val, int param_size) +int node_get_attr_generic(void* node, const char* param_name, const char* type_name, void* param_val, int param_size) { - return NodeGetParamGeneric(node, param_name, type_info, param_val, param_size); + return NodeGetParamGeneric(node, param_name, type_name, param_val, param_size); } -int node_set_attr_generic(void* node, const char* param_name, const void* type_info, const void* param_val, +int node_set_attr_generic(void* node, const char* param_name, const char* type_name, const void* param_val, int param_size) { - return NodeSetParamGeneric(node, param_name, type_info, param_val, param_size); + return NodeSetParamGeneric(node, param_name, type_name, param_val, param_size); } -int node_add_attr(void* node, const char* param_name, const void* type_info, int param_size) +int node_add_attr(void* node, const char* 
param_name, const char* type_name, int param_size) { - return NodeAddParamGeneric(node, param_name, type_info, param_size); + return NodeAddParamGeneric(node, param_name, type_name, param_size); } static int real_vload_model(context_t exec_context, const char* model_name, const char* model_format, const void* addr, @@ -262,7 +258,7 @@ int save_graph_internal(graph_t graph, const char* model_format, const char* fna /* Get runtime graph pointer */ GraphExecutor* executor = static_cast(graph); - Graph* g = executor->GetGraph(); + Graph* g = executor->GetOptimizedGraph(); /* Save the graph to the files */ if(!serializer->SaveModel(file_list, g)) @@ -271,6 +267,131 @@ int save_graph_internal(graph_t graph, const char* model_format, const char* fna return 0; } +static float get_absmax_val(float* data, int data_size) +{ + float max_val = 0.f; + if(data != nullptr) + { + for(int i = 0; i < data_size; i++) + { + float abs_val = fabs(data[i]); + if(abs_val > max_val) + max_val = abs_val; + } + } + return max_val; +} + +static inline bool isSkipQuant(int nodeInedx, int node_no_quant_idxs[], int number) +{ + for(int i = 0; i < number; i++) + { + if(nodeInedx == node_no_quant_idxs[i]) + return true; + } + return false; +} +#define GET_TENGINE_DT(a) (a+1) +int quant_graph_internal(graph_t graph, int quant_mode, int node_no_quant_idxs[], int node_no_quant_number) +{ + GraphExecutor* executor = static_cast(graph); + Graph* g = executor->GetOptimizedGraph(); + + for(unsigned int i = 0; i < g->seq_nodes.size(); i++) + { + if(isSkipQuant(i, node_no_quant_idxs, node_no_quant_number)) + continue; + + Node* node = g->seq_nodes[i]; + Operator* op = node->GetOp(); + if(op->GetName() == "Const") + continue; + + /* set node output */ + Tensor* output = node->GetOutputTensor(0); + output->SetDataType(GET_TENGINE_DT(quant_mode)); + + if(op->GetName() == "Convolution" || op->GetName() == "FullyConnected") + { + // quant weight + Tensor* weight_tensor = node->GetInputTensor(1); + 
if(weight_tensor->GetDataType() == TENGINE_DT_FP32) + { + int kernel_size = (weight_tensor->GetTotalSize()) / sizeof(float); + float* kernel_org = (float*)weight_tensor->GetMemAddr(); + + // fp16 quant + if(quant_mode == TENGINE_QUANT_FP16) + { + __fp16 *kernel_new = (__fp16*)malloc(kernel_size * sizeof(__fp16)); + for(int i = 0; i < kernel_size; i++) + kernel_new[i] = fp32_to_fp16(kernel_org[i]); + + // set the memory + weight_tensor->FreeTensor(); + weight_tensor->SetMemAddr(kernel_new); + + // set the data type + weight_tensor->SetDataType(TENGINE_DT_FP16); + } + // int8 quant + else if (quant_mode == TENGINE_QUANT_INT8) + { + int8_t *kernel_new = (int8_t *)malloc(kernel_size); + float weight_max = get_absmax_val(kernel_org, kernel_size); + float weight_scale = weight_max / 127; + int zero_point = 0; + + for(int i = 0; i < kernel_size; i++) + kernel_new[i] = (int8_t)(round(kernel_org[i] / weight_scale) + zero_point); + + // set the memory + weight_tensor->FreeTensor(); + weight_tensor->SetMemAddr(kernel_new); + + // set the data type + weight_tensor->SetDataType(TENGINE_DT_INT8); + + // set the quant param + auto p_quant = weight_tensor->GetQuantParam(); + p_quant->resize(1); + QuantParam& param = (*p_quant)[0]; + param.scale = weight_scale; + param.zero_point = zero_point; + } + } + + // quant bias + if(node->GetInputNum() > 2) + { + Tensor* bias_tensor = node->GetInputTensor(2); + if(bias_tensor->GetDataType() == TENGINE_DT_FP32) + { + int bias_size = (bias_tensor->GetTotalSize()) / sizeof(float); + float* bias_org = (float*)bias_tensor->GetMemAddr(); + + if(quant_mode == TENGINE_QUANT_FP16) + { + __fp16 *bias_new = (__fp16*)malloc(bias_size * sizeof(__fp16)); + for(int i = 0; i < bias_size; i++) + bias_new[i] = fp32_to_fp16(bias_org[i]); + + // set the memory + bias_tensor->FreeTensor(); + bias_tensor->SetMemAddr(bias_new); + + // set the data type + bias_tensor->SetDataType(TENGINE_DT_FP16); + } + } + } + } + } + + return 0; +} + + graph_t 
create_graph_in_context(context_t exec_context, const char* graph_name, const char* model_name) { GraphExecutor* executor = new GraphExecutor(); @@ -301,12 +422,50 @@ extern void driver_plugin_init(void); namespace TEngine { -void InitAllPlugin(void) +int hclcpu_plugin_init(void) +{ + static ShareLibParser so_handle; + + try { + if(so_handle.Load("libhclcpu.so")<0) + { + LOG_ERROR()<<"cannot load libhclcpu.so\n"; + set_tengine_errno(ENOENT); + return -1; + } + + if(so_handle.ExecuteFunc("register_hclcpu_ops")<0) + { + LOG_ERROR()<<"register hcl cpu ops failed\n"; + set_tengine_errno(EFAULT); + return -1; + } + + } + + catch(const std::exception& e) + { + LOG_ERROR()< -#include "data_layout.hpp" #include "tensor_shape.hpp" #include "logger.hpp" #include "compiler.hpp" namespace TEngine { -void TShape::SetDim(const std::vector& args, bool layout_check) +void TShape::SetDim(const std::vector& args) { - if(layout_check) - { - const DataLayout* p_layout = DataLayout::GetLayout(layout_); - - if(args.size() != p_layout->GetDimNum()) - { - throw(std::runtime_error("shape dims mismatch")); - } - } - dim_ = args; } @@ -68,36 +57,34 @@ void TShape::DumpShape(std::ostream& os) const os << result; } -#define GET_DIM(D) \ - const DataLayout* p_layout = DataLayout::GetLayout(layout_); \ - int idx = p_layout->Get##D(); \ - if(idx < 0) \ - return 1; \ - return dim_[idx] - int TShape::GetN(void) const { - GET_DIM(N); + return Shape(0); } int TShape::GetC(void) const { - GET_DIM(C); + if(layout_==TENGINE_LAYOUT_NCHW) + return Shape(1); + else + return Shape(3); } int TShape::GetH(void) const { - GET_DIM(H); + if(layout_==TENGINE_LAYOUT_NCHW) + return Shape(2); + else + return Shape(1); } int TShape::GetW(void) const { - GET_DIM(W); + if(layout_==TENGINE_LAYOUT_NCHW) + return Shape(3); + else + return Shape(2); } -int TShape::GetD(void) const -{ - GET_DIM(D); -} } // namespace TEngine diff --git a/doc/benchmark.md b/doc/benchmark.md index 4f477abaa..c10d40702 100644 --- 
a/doc/benchmark.md +++ b/doc/benchmark.md @@ -3,110 +3,124 @@ ## **Revision Record** | Date | Rev |Change Description|Author | ---------- | --- |---|---| -| 2017-12-29 | 0.1 |Initial version|FeyaHan -| 2018-01-06 | 0.2 |Add multi CPU performance|HaoLuo -| 2018-06-14 | 0.3 |Add ACL_GPU performance| Chunying +| 2018-12-27 | 0.9 |update newest benchmark|ZhangRui/LuoHao --- ## **Catalog** -#### [Test Environment](benchmark.md#test-environment-1) -#### [Test](benchmark.md#test-1) -#### [Performance](benchmark.md#performance-1) +#### [**Test Environment**](benchmark.md#test-environment-1) +#### [**Test Steps**](benchmark.md#test-steps-1) +#### [**Performance**](benchmark.md#performance-1) --- - -## Test Environment -- Tengine : v0.3 -- Broad : ROCK960 -- CPU : Rockchip RK3399. - - * Dual-core Cortex-A72 up to 2.0GHz (real frequency is 1.8GHz); - - * Quad-core Cortex-A53 up to 1.5GHz (real frequency is 1.4GHz). - -- GPU : Mali T864 (800MHz). +## **Test Environment** +- Tengine : 0.9.0 +- Broad : **Firefly-3399 (RK3399), TinkerBoard (RK3288)** - Operating System : Ubuntu 16.04. --- -## Test +## **Test Steps** -### Step1. install Tengine +### Step1. **install Tengine** - For more information about the build of Tengine, please refer to the documentation of [install](install.md) +For more information about the build of Tengine, please refer to the documentation of [install](install.md). -### Step2. lock the cpu frequency at maximum -```bash - #switch to root user - > sudo su +### Step2. **lock the cpu frequency at maximum** - #check which available policy, policy4 for A72, policy0 for A53 - > cat /sys/devices/system/cpu/cpufreq/policy4/scaling_available_governors +Please set the scaling governer into performance. Below is an example to set the big core of RK33399 to performance mode. 
- #set performance policy - > echo performance > /sys/devices/system/cpu/cpufreq/policy4/scaling_governor - - #check cpu frequency - > cat /sys/devices/system/cpu/cpufreq/policy4/cpuinfo_cur_freq +```bash +> sudo su #switch to root user +> cat /sys/devices/system/cpu/cpufreq/policy4/scaling_available_governors #check which available policy, note that policy4 is for A72 on RK3399 and policy0 is for A53 +conservative ondemand userspace powersave interactive performance +> echo performance > /sys/devices/system/cpu/cpufreq/policy4/scaling_governor #set performance policy +> cat /sys/devices/system/cpu/cpufreq/policy4/cpuinfo_cur_freq #check cpu frequency +1800000 ``` -### Step3: test bench_sqz, bench_mobilenet +### Step3: **test benchmark squeezenet_v1.1 and mobilenet_v1** * **get model** - You can get the models from [Tengine model zoo](https://pan.baidu.com/s/1LXZ8vOdyOo50IXS0CUPp8g),the pass word is `57vb`. - And then, put the "mobilenet.caffemodel" "mobilenet_deploy.prototxt" "squeezenet_v1.1.caffemodel" "sqz.prototxt" in `~/tengine/models` - -* **set device** - 1. use ACL_GPU + You can get the models from [Tengine model zoo](https://pan.baidu.com/s/1LXZ8vOdyOo50IXS0CUPp8g), the password is `57vb`. + And then, put the "mobilenet.caffemodel", "mobilenet_deploy.prototxt", "squeezenet_v1.1.caffemodel", "sqz.prototxt" in `~/tengine/models`. - For how to build tengine with ACL_GPU, see [acl_driver.md](acl_driver.md). - You can run the test as +* **set CPU** - ``` - ./build/tests/bin/bench_sqz -d acl_opencl - ./build/tests/bin/bench_mobilenet -d acl_opencl - ``` - 2. use CPU: single-core/multi-cores + By setting the environment variable `TENGINE_CPU_LIST`, different working CPUs can be set. 
+ + For RK3399: + ``` + 1 A72: export TENGINE_CPU_LIST=5 + 2 A72: export TENGINE_CPU_LIST=4,5 + 1 A53: export TENGINE_CPU_LIST=2 + 4 A53: export TENGINE_CPU_LIST=0,1,2,3 + + ``` + For RK3288: + ``` + 1 A17: export TENGINE_CPU_LIST=2 + 4 A17: export TENGINE_CPU_LIST=0,1,2,3 + + ``` - To assign on different cpu core, there are two methods: +* **run int8/float32 inference** - - `export TENGINE_CPU_LIST=0,1,2,3` - - `tests/bin/bench_sqz –p 0,1,2,3` + By default, Tengine run inference as **float32**. To run int8 inference, you need to set the env_variable `KERNEL_MODE` as `2`. And set it back to `0` to run float32 inference. + ``` + export KERNEL_MODE=2 # run int8 inference + export KERNEL_MODE=0 # run float32 inference + ``` - For rk3399, cpu(0-3) are A53, cpu(4-5) are A72. - - - 1A72 `tests/bin/bench_sqz –p 4` - - 2A72 `tests/bin/bench_sqz –p 4,5` - - 1A53 `tests/bin/bench_sqz –p 0` - - 4A53 `tests/bin/bench_sqz –p 0,1,2,3` +--- ## Performance +### RK3399 + +#### MobileNet -| | SqueezeNet(ms) |Mobilenet (ms) | +| | Float32(ms) | INT8(ms) | | ---------- | ---|---| -| rk3399(1*A72) | 91.2 |122.1 | -| rk3399(2*A72) | 51.2 |65.4 | -| rk3399(1*A53) | 232.5 |323.6 | -| rk3399(4*A53) | 79.2 |96.3 | -| ACL(GPU)| 61.4| 95.9| +| rk3399(1*A72) | 111.8 |80.1 | +| rk3399(2*A72) | 63.7 |46.5 | +| rk3399(1*A53) | 259.6 |198.0 | +| rk3399(4*A53) | 81.6 |63.7 | -Notes:
-(1) We run N=100 times per test case.
-(2) We take the average time of N repeats. - ---- +#### SqueezeNet +| | Float32(ms) | INT8(ms) | +| ---------- | ---|---| +| rk3399(1*A72) | 79.4 |60.4 | +| rk3399(2*A72) | 49.3 |37.6 | +| rk3399(1*A53) | 177.0 |151.2 | +| rk3399(4*A53) | 68.4 |59.6 | +### RK3288 +#### MobileNet +| | Float32(ms) | INT8(ms) | +| ---------- | ---|---| +| rk3399(1*A17) | 201 |111 | +| rk3399(4*A17) | 67.4 |40 | + + +#### SqueezeNet +| | Float32(ms) | INT8(ms) | +| ---------- | ---|---| +| rk3399(1*A17) | 142 |88 | +| rk3399(4*A17) | 55 |35 | + +Notes:
+(1) We take the average time of N repeats.
+(2) We run N=100 times per test case.
diff --git a/doc/build_android.md b/doc/build_android.md index b8a97fd27..f2302d548 100644 --- a/doc/build_android.md +++ b/doc/build_android.md @@ -94,7 +94,7 @@ cp ~/ComputeLibrary/build_64/libarm_compute* ~/android-ndk-r16b/platforms/androi #For armv7: cp ~/ComputeLibrary/build_32/libarm_compute* ~/android-ndk-r16b/platforms/android-21/arch-arm/usr/lib/ cd ~/tengine/example -mkdir build +mdkir build cd build ../android_build_armv7.sh or ../android_build_armv8.sh make -j4 diff --git a/driver/acl_graph/acl_graph.hpp b/driver/acl_graph/acl_graph.hpp index 19428791f..186463a91 100644 --- a/driver/acl_graph/acl_graph.hpp +++ b/driver/acl_graph/acl_graph.hpp @@ -326,10 +326,10 @@ class CLGraph Convolution* conv_op = dynamic_cast(node->GetOp()); ConvParam* param = conv_op->GetParam(); - int pad_x = param->pads[1]; - int pad_y = param->pads[0]; - int pad_x_1 = param->pads[3]; - int pad_y_1 = param->pads[2]; + int pad_x = param->pad_w0; + int pad_y = param->pad_h0; + int pad_x_1 = param->pad_w1; + int pad_y_1 = param->pad_h1; int stride_x = param->stride_w; int stride_y = param->stride_h; int group = param->group; @@ -518,8 +518,8 @@ class CLGraph /* weight */ Tensor* w_tensor = node->GetInputTensor(1); name = w_tensor->GetName(); - int M = w_tensor->GetShape().GetH(); - int K = w_tensor->GetShape().GetW(); + int M = w_tensor->GetShape().GetN(); + int K = w_tensor->GetShape().GetC(); CLTensor* wtensor = new CLTensor(); wtensor->allocator()->init(TensorInfo(TensorShape(K, M), 1, data_type_)); tensors_map_[name] = wtensor; @@ -573,12 +573,12 @@ class CLGraph { Pooling* pool_op = dynamic_cast(node->GetOp()); PoolParam* param = pool_op->GetParam(); - int pad_x = param->pad_w; - int pad_y = param->pad_h; + int pad_x = param->pad_w0; + int pad_y = param->pad_h0; int stride_x = param->stride_w; int stride_y = param->stride_h; - int kernel_w = param->kernel_shape[1]; - int kernel_h = param->kernel_shape[0]; + int kernel_w = param->kernel_w ; + int kernel_h = param->kernel_h; int 
type = param->alg; int global = param->global; diff --git a/driver/cpu/cpu_driver.cpp b/driver/cpu/cpu_driver.cpp index 09d46082e..d4d2e92ef 100644 --- a/driver/cpu/cpu_driver.cpp +++ b/driver/cpu/cpu_driver.cpp @@ -587,7 +587,7 @@ static void probe_func(void) cpu_dev->online_cpu_number = default_param.cpu_number; } - create_cpu_device("generic_probe", cpu_dev); + create_cpu_device(cpu_dev->cpu_name, cpu_dev); cpu_dev->online_cpu_list = saved_list; cpu_dev->online_cpu_number = saved_number; diff --git a/driver/cpu/cpu_probe.cpp b/driver/cpu/cpu_probe.cpp index 73ab8eaf8..4138d8940 100644 --- a/driver/cpu/cpu_probe.cpp +++ b/driver/cpu/cpu_probe.cpp @@ -25,104 +25,107 @@ #include #include #include +#include #include #include #include "cpu_device.h" -int get_cpu_number(void) -{ - FILE* fp = fopen("/proc/cpuinfo", "rb"); - int num = 0; - char buf[256]; - - if(fp == NULL) - return 1; +struct cpu_item { + int cpu_id; + int max_freq; + int cluster_leader; +}; - while(fgets(buf, 256, fp)) - { - if(memcmp(buf, "processor", 9) == 0) - num++; - } +/* + for the meaning files in /sys/device/system/cpu/cpu0/cpufreq + please read documentation/cpu-freq/user-guide.txt +*/ - fclose(fp); +int get_cpu_items(struct cpu_item ** p_item) +{ + char cpu_path[128]; + char file_path[128]; + struct cpu_item * cpu_item=NULL; + struct stat stat_buf; + int i=0; - if(num < 1) - num = 1; + while(1) + { + FILE * fp; + int ret; - return num; -} + sprintf(cpu_path,"/sys/devices/system/cpu/cpu%d/cpufreq",i); -#ifdef __ARM_ARCH + if(stat(cpu_path,&stat_buf)<0) + break; -#ifdef __ANDROID__ -int get_cpu_max_freq(int id) -{ - char fname[256]; - int max_freq = 100; - FILE* fp = NULL; + cpu_item=(struct cpu_item * )realloc(cpu_item,sizeof(struct cpu_item)*(i+1)); - sprintf(fname, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", id); + cpu_item[i].cpu_id=i; - fp = fopen(fname, "rb"); + ret=snprintf(file_path,128,"%s/cpuinfo_max_freq",cpu_path); - if(!fp) - { - sprintf(fname, 
"/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", id); - fp = fopen(fname, "rb"); - } + if(ret>=128) + file_path[127]=0x0; + + fp=fopen(file_path,"rb"); - if(fp) - { - while(!feof(fp)) - { - int freq; - if(fscanf(fp, "%d %*d\n", &freq) != 1) - break; + if(fp==NULL) + break; - if(freq > max_freq) - max_freq = freq; - } + if(fscanf(fp, "%d", &cpu_item[i].max_freq)<0) + { + fclose(fp); + break; + } - fclose(fp); + fclose(fp); - return max_freq; - } + ret=snprintf(file_path,128,"%s/related_cpus",cpu_path); - sprintf(fname, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", id); - fp = fopen(fname, "rb"); + if(ret>=128) + file_path[127]=0x0; - if(fp) - { - fscanf(fp, "%d", &max_freq); - fclose(fp); - } + fp=fopen(file_path,"rb"); - return max_freq; -} + if(fp==NULL) + break; -#else -int get_cpu_max_freq(int id) -{ - char cpu_fname[256]; - FILE* fp; - int max_freq; + if(fscanf(fp,"%d ",&cpu_item[i].cluster_leader)<0) + { + fclose(fp); + break; + } - sprintf(cpu_fname, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_max_freq", id); + fclose(fp); - fp = fopen(cpu_fname, "r"); + i++; - if(!fp) - return 0; + } - if(fscanf(fp, "%d", &max_freq) < 0) - return 0; + if(i==0) + { + /* + some weird thing happened! 
just fill a fake one + TODO: add a log here + */ + + cpu_item=(struct cpu_item *)malloc(sizeof(struct cpu_item)); + + cpu_item[0].cpu_id=0; + cpu_item[0].max_freq=100; + cpu_item[0].cluster_leader=0; + + i++; + } - fclose(fp); + *p_item=cpu_item; - return max_freq; + return i; } -#endif + +#ifdef __ARM_ARCH static char* get_target_line(FILE* fp, const char* target_prefix) { @@ -137,7 +140,7 @@ static char* get_target_line(FILE* fp, const char* target_prefix) return nullptr; } -int get_cpu_model_arch(int id, struct cpu_cluster* cluster) +static int get_cpu_model_arch(int id, struct cpu_cluster* cluster) { char cpu_fname[256]; FILE* fp; @@ -239,52 +242,83 @@ int get_cpu_model_arch(int id, struct cpu_cluster* cluster) return 0; } +#else + +static int get_cpu_model_arch(int id, struct cpu_cluster* cluster) +{ + cluster->cpu_model = CPU_GENERIC; + cluster->cpu_arch = CPU_GENERIC; + cluster->l1_size = 32 << 10; + cluster->l2_size = 512 << 10; + + return 0; +} + +#endif struct cpu_info* probe_system_cpu(void) { static struct cpu_info cpu_dev; - int cluster_idx = -1; - int last_max_freq = -1; - int top_max_freq = -1; + struct cpu_item * cpu_item; + int cpu_number; + int cluster_number=1; - int cpu_number = get_cpu_number(); - struct cpu_cluster* cpu_cluster = ( struct cpu_cluster* )malloc(sizeof(struct cpu_cluster) * (cpu_number / 4 + 1)); + cpu_number=get_cpu_items(&cpu_item); - for(int i = 0; i < cpu_number; i++) + /* assuming cluster cpus are continuous */ + for(int i=1;icpu_number = 0; - } - else - cluster = cpu_cluster + cluster_idx; + memset(cpu_cluster->hw_cpu_id,-1,sizeof(int)*MAX_CLUSTER_CPU_NUMBER); - cluster->max_freq = max_freq; - cluster->cpu_number++; + /* setup cpu 0 */ + cpu_cluster[0].cpu_number=1; + cpu_cluster[0].max_freq=cpu_item[0].max_freq; + cpu_cluster[0].hw_cpu_id[0]=cpu_item[0].cpu_id; - last_max_freq = max_freq; - if(top_max_freq < max_freq) - top_max_freq = max_freq; + int top_max_freq=0; + struct cpu_cluster * cluster=cpu_cluster; + + for(int 
i=1;ihw_cpu_id,-1,sizeof(int)*MAX_CLUSTER_CPU_NUMBER); + cluster->cpu_number=0; + cluster->max_freq=cpu_item[i].max_freq; - if(get_cpu_model_arch(i, cluster) < 0) - return NULL; + if(cluster->max_freq>top_max_freq) + top_max_freq=cluster->max_freq; + } + + cluster->hw_cpu_id[cluster->cpu_number]=cpu_item[i].cpu_id; + cluster->cpu_number++; } - int start_cpu = 0; + free(cpu_item); + + for(int i=0;ihw_cpu_id[0],cluster); + } - cpu_dev.cluster_number = cluster_idx + 1; - cpu_dev.cluster = cpu_cluster; + cpu_dev.cluster_number=cluster_number; + cpu_dev.cluster=cpu_cluster; - // setup the online cpu according to top_max_freq cpu_dev.online_cpu_list = ( int* )malloc(sizeof(int) * cpu_number); + int online_cpu_number = 0; for(int i = 0; i < cpu_dev.cluster_number; i++) @@ -293,79 +327,26 @@ struct cpu_info* probe_system_cpu(void) for(int j = 0; j < cluster->cpu_number; j++) { - cluster->hw_cpu_id[j] = start_cpu + j; - - if(cluster->max_freq == top_max_freq) + if(cluster->max_freq >= top_max_freq) { cpu_dev.online_cpu_list[online_cpu_number++] = cluster->hw_cpu_id[j]; } } - - start_cpu += cluster->cpu_number; } +#ifdef __ARM_ARCH cpu_dev.cpu_name = "arm.probed"; - cpu_dev.board_name = "generic.probed"; - - cpu_dev.online_cpu_number = online_cpu_number; - - return &cpu_dev; -} - #else + cpu_dev.cpu_name = "x86.probed"; +#endif -struct cpu_info* probe_system_cpu(void) -{ - /* create cpu_info */ - static struct cpu_info cpu_dev; - - int cpu_number = get_cpu_number(); - - struct cpu_cluster* cpu_cluster = ( struct cpu_cluster* )malloc(sizeof(struct cpu_cluster) * (cpu_number / 4 + 1)); - - int cluster_number = 0; - - for(int i = 0; i < cpu_number; i += 4) - { - struct cpu_cluster* cluster = cpu_cluster + cluster_number; - int start_cpu_id = cluster_number * 4; - - cluster->cpu_number = start_cpu_id + 4 > cpu_number ? 
cpu_number - start_cpu_id : 4; - cluster->max_freq = 2000; - cluster->cpu_model = CPU_GENERIC; - cluster->cpu_arch = CPU_GENERIC; - cluster->l1_size = 32 << 10; - cluster->l2_size = 512 << 10; - - for(int j = 0; j < cluster->cpu_number; j++) - cluster->hw_cpu_id[j] = start_cpu_id + j; - - cluster_number++; - } - - int online_cpu_number = cpu_number; - - cpu_dev.cpu_name = "geneirc chip"; - cpu_dev.board_name = "generic board"; - - cpu_dev.cluster_number = 1; - cpu_dev.l3_size = 512 << 10; - + cpu_dev.board_name = "generic.probed"; cpu_dev.online_cpu_number = online_cpu_number; - cpu_dev.online_cpu_list = ( int* )malloc(sizeof(int) * online_cpu_number); - - for(int i = 0; i < cpu_number; i++) - { - cpu_dev.online_cpu_list[i] = i; - } - - cpu_dev.cluster_number = cluster_number; - cpu_dev.cluster = cpu_cluster; return &cpu_dev; } -#endif + void free_probe_cpu_info(struct cpu_info* cpu_dev) { diff --git a/driver/cpu/cpu_runner.cpp b/driver/cpu/cpu_runner.cpp index 87e9116e8..171002378 100644 --- a/driver/cpu/cpu_runner.cpp +++ b/driver/cpu/cpu_runner.cpp @@ -271,7 +271,7 @@ static void parse_node(void* data, int repeat_count, uint64_t total_time) { Convolution* conv_op = dynamic_cast(node->GetOp()); ConvParam* param = conv_op->GetParam(); - printf("%2d x %d / %d_p%d", param->kernel_h, param->kernel_w, param->stride_h, param->pads[0]); + printf("%2d x %d / %d_p%d", param->kernel_h, param->kernel_w, param->stride_h, param->pad_h0); // if(param->kernel_h==3 && param->stride_h==1) // { // printf(" [%d]",param->mth); @@ -386,7 +386,7 @@ bool CPURunner::Run(Subgraph* sub_graph) std::vector outputs(output_number); - if(!op->InferShape(inputs, outputs, node_ops->exec_attr->layout)) + if(!op->InferShape(inputs, outputs, node_ops->exec_attr->graph_layout)) { XLOG_ERROR() << "infer shaped for node: " << node->GetName() << " op: " << op->GetName() << " failed\n"; ret = false; @@ -654,10 +654,13 @@ bool CPURunner::FreeMem(Subgraph* sub_graph) 
sub_graph->RemoveAttr("shared_temp_memory"); } - MemPool* mem_pool = any_cast(sub_graph->GetAttr("MemPool")); - delete mem_pool; + if(sub_graph->ExistAttr("MemPool")) + { + MemPool* mem_pool = any_cast(sub_graph->GetAttr("MemPool")); + delete mem_pool; - sub_graph->RemoveAttr("MemPool"); + sub_graph->RemoveAttr("MemPool"); + } return true; } @@ -888,11 +891,12 @@ bool CPURunner::AllocateMem(Subgraph* sub_graph) { void* tensor_addr = get_tensor_mem(input_tensor); int total_size = tensor->GetTotalSize(); - set_tensor_mem(tensor, tensor_addr, total_size, nullptr); - - mem_pool->AddRef(tensor); + if(set_tensor_mem(tensor, tensor_addr, total_size, nullptr)) + { + mem_pool->AddRef(tensor); - continue; + continue; + } } } @@ -900,14 +904,16 @@ bool CPURunner::AllocateMem(Subgraph* sub_graph) { int total_size = tensor->GetTotalSize(); void* tensor_addr = mem_pool->Allocate(tensor, total_size); - set_tensor_mem(tensor, tensor_addr, total_size, nullptr); + if(!set_tensor_mem(tensor, tensor_addr, total_size, nullptr)) + return false; } } /* input tensor */ for(unsigned int i = 0; i < node->GetInputNum(); i++) { Tensor* input_tensor = node->GetInputTensor(i); - mem_pool->Free(input_tensor); + if(input_tensor->GetName() != "data") + mem_pool->Free(input_tensor); } } diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9c9ab147a..164187f24 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -38,12 +38,10 @@ add_subdirectory(faster_rcnn) add_subdirectory(lighten_cnn) add_subdirectory(imagenet_classification) add_subdirectory(mobilenet_ssd) - if( NOT ANDROID) - #add_subdirectory(caffe_wrapper) - #add_subdirectory(tensorflow_wrapper) + add_subdirectory(caffe_wrapper) + add_subdirectory(tensorflow_wrapper) endif() - add_subdirectory(tengine_model) diff --git a/examples/YuFaceDetectNet/yu_facedetect.cpp b/examples/YuFaceDetectNet/yu_facedetect.cpp index 9eea88718..b7aefc96a 100644 --- a/examples/YuFaceDetectNet/yu_facedetect.cpp +++ 
b/examples/YuFaceDetectNet/yu_facedetect.cpp @@ -1,211 +1,211 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2019, Open AI Lab - * Author: chunyinglv@openailab.com - */ - -#include -#include -#include - -#include "tengine_c_api.h" -#include "opencv2/imgproc/imgproc.hpp" -#include "opencv2/highgui/highgui.hpp" - -float show_threshold = 0.5; - -struct Box -{ - float x0; - float y0; - float x1; - float y1; - int class_idx; - float score; -}; - -void post_process_ssd(cv::Mat& img, float threshold, float* outdata, int num, const std::string& save_name) -{ - std::vector boxes; - int line_width = img.cols * 0.005; - printf("--------------------------------------------\n"); - printf("Face id: prob%%\tBOX:( x0 , y0 ),( x1 , y1 )\n"); - printf("--------------------------------------------\n"); - int detected_face_num = 0; - for(int i = 0; i < num; i++) - { - if(outdata[1] >= threshold) - { - detected_face_num += 1; - Box box; - box.class_idx = outdata[0]; - box.score = outdata[1]; - box.x0 = outdata[2] * img.cols; - box.y0 = outdata[3] * img.rows; - box.x1 = outdata[4] * img.cols; - box.y1 = outdata[5] * img.rows; - boxes.push_back(box); - printf("Face %d:\t%.0f%%\t", detected_face_num, 
box.score * 100); - printf("BOX:( %g , %g ),( %g , %g )\n", box.x0, box.y0, box.x1, box.y1); - } - outdata += 6; - } - printf("detect faces : %d \n", detected_face_num); - for(int i = 0; i < ( int )boxes.size(); i++) - { - Box box = boxes[i]; - cv::rectangle(img, cv::Rect(box.x0, box.y0, (box.x1 - box.x0), (box.y1 - box.y0)), cv::Scalar(255, 255, 0), - line_width); - - std::ostringstream score_str; - score_str.precision(3); - score_str << box.score; - std::string label = score_str.str(); - int baseLine = 0; - cv::Size label_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.3, 1, &baseLine); - cv::rectangle(img, - cv::Rect(cv::Point(box.x0, box.y0 - label_size.height), - cv::Size(label_size.width, label_size.height + baseLine)), - cv::Scalar(255, 255, 0), CV_FILLED); - cv::putText(img, label, cv::Point(box.x0, box.y0), cv::FONT_HERSHEY_SIMPLEX, 0.3, cv::Scalar(0, 0, 0)); - } - cv::imwrite(save_name, img); - std::cout << "======================================\n"; - std::cout << "[DETECTED IMAGE SAVED]:\t" << save_name << "\n"; - std::cout << "======================================\n"; -} - -void get_input_data(cv::Mat& img, float* input_data, int img_h, int img_w) -{ - int mean[3] = { 104,117,123 }; - unsigned char* src_ptr=(unsigned char*)(img.ptr(0)); - int hw = img_h * img_w; - for(int h = 0; h < img_h; h++) - { - for(int w = 0; w < img_w; w++) - { - for(int c = 0; c < 3; c++) - { - input_data[c * hw + h * img_w + w] =(float)(*src_ptr - mean[c]); - src_ptr++; - } - } - } -} - -int main(int argc, char* argv[]) -{ - if(argc < 4) - { - std::cout << "[Usage]: " << argv[0] << " \n"; - return 0; - } - std::string proto_name_ = argv[1]; - std::string mdl_name_ = argv[2]; - std::string image_file = argv[3]; - - std::string save_file = "save.jpg"; - - cv::Mat img = cv::imread(image_file); - if(img.empty()) - { - std::cerr << "failed to read image file " << image_file << "\n"; - return -1; - } -#if 1 - // resize to 320 x 240 - cv::Mat resize_img; - int img_w = 320; 
- int img_h = 240; - cv::resize(img, resize_img, cv::Size(img_w, img_h), 0, 0,cv::INTER_NEAREST); - float* input_data = ( float* )malloc(sizeof(float) * img_h * img_w * 3); - get_input_data(resize_img, input_data, img_h, img_w); -#else - // use origin image size - int img_h = img.rows; - int img_w = img.cols; - float* input_data = ( float* )malloc(sizeof(float) * img_h * img_w * 3); - get_input_data(img, input_data, img_h, img_w); -#endif - - init_tengine(); - if(request_tengine_version("0.9") < 0) - return 1; - - graph_t graph = create_graph(nullptr, "caffe", proto_name_.c_str(), mdl_name_.c_str()); - if(graph == nullptr) - { - std::cout << "Create graph0 failed\n"; - std::cout << "errno: " << get_tengine_errno() << "\n"; - return -1; - } - - /* get input tensor */ - int node_idx = 0; - int tensor_idx = 0; - tensor_t input_tensor = get_graph_input_tensor(graph, node_idx, tensor_idx); - int dims[] = {1, 3, img_h, img_w}; - set_tensor_shape(input_tensor, dims, 4); - /* setup input buffer */ - if(set_tensor_buffer(input_tensor, input_data, 3 * img_h * img_w * 4) < 0) - { - std::printf("Set buffer for tensor failed\n"); - return -1; - } - - prerun_graph(graph); - - // time run_graph - int repeat_count = 1; - const char* repeat = std::getenv("REPEAT_COUNT"); - if(repeat) - repeat_count = std::strtoul(repeat, NULL, 10); - - struct timeval t0, t1; - float avg_time = 0.f; - gettimeofday(&t0, NULL); - for(int i = 0; i < repeat_count; i++) - run_graph(graph, 1); - gettimeofday(&t1, NULL); - float mytime = ( float )((t1.tv_sec * 1000000 + t1.tv_usec) - (t0.tv_sec * 1000000 + t0.tv_usec)) / 1000; - avg_time += mytime; - std::cout << "--------------------------------------\n"; - std::cout << "repeat " << repeat_count << " times, avg time per run is " << avg_time / repeat_count << " ms\n"; - - // post process - tensor_t out_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out"); - int out_dim[4]; - get_tensor_shape(out_tensor, out_dim, 4); - float* outdata = ( float* 
)get_tensor_buffer(out_tensor); - int num = out_dim[1]; - - post_process_ssd(img, show_threshold, outdata, num, save_file.c_str()); - - // free - release_graph_tensor(out_tensor); - release_graph_tensor(input_tensor); - postrun_graph(graph); - destroy_graph(graph); - free(input_data); - release_tengine(); - - return 0; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: chunyinglv@openailab.com + */ + +#include +#include +#include + +#include "tengine_c_api.h" +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" + +float show_threshold = 0.5; + +struct Box +{ + float x0; + float y0; + float x1; + float y1; + int class_idx; + float score; +}; + +void post_process_ssd(cv::Mat& img, float threshold, float* outdata, int num, const std::string& save_name) +{ + std::vector boxes; + int line_width = img.cols * 0.005; + printf("--------------------------------------------\n"); + printf("Face id: prob%%\tBOX:( x0 , y0 ),( x1 , y1 )\n"); + printf("--------------------------------------------\n"); + int detected_face_num = 0; + for(int i = 0; i < num; i++) + { + if(outdata[1] >= threshold) + { + detected_face_num += 1; + Box box; + box.class_idx = outdata[0]; + box.score = outdata[1]; + box.x0 = outdata[2] * img.cols; + box.y0 = outdata[3] * img.rows; + box.x1 = outdata[4] * img.cols; + box.y1 = outdata[5] * img.rows; + boxes.push_back(box); + printf("Face %d:\t%.0f%%\t", detected_face_num, box.score * 100); + printf("BOX:( %g , %g ),( %g , %g )\n", box.x0, box.y0, box.x1, box.y1); + } + outdata += 6; + } + printf("detect faces : %d \n", detected_face_num); + for(int i = 0; i < ( int )boxes.size(); i++) + { + Box box = boxes[i]; + cv::rectangle(img, cv::Rect(box.x0, box.y0, (box.x1 - box.x0), (box.y1 - box.y0)), cv::Scalar(255, 255, 0), + line_width); + + std::ostringstream score_str; + score_str.precision(3); + score_str << box.score; + std::string label = score_str.str(); + int baseLine = 0; + cv::Size label_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.3, 1, &baseLine); + cv::rectangle(img, + cv::Rect(cv::Point(box.x0, box.y0 - label_size.height), + cv::Size(label_size.width, label_size.height + baseLine)), + cv::Scalar(255, 255, 0), CV_FILLED); + cv::putText(img, label, cv::Point(box.x0, box.y0), cv::FONT_HERSHEY_SIMPLEX, 0.3, cv::Scalar(0, 
0, 0)); + } + cv::imwrite(save_name, img); + std::cout << "======================================\n"; + std::cout << "[DETECTED IMAGE SAVED]:\t" << save_name << "\n"; + std::cout << "======================================\n"; +} + +void get_input_data(cv::Mat& img, float* input_data, int img_h, int img_w) +{ + int mean[3] = { 104,117,123 }; + unsigned char* src_ptr=(unsigned char*)(img.ptr(0)); + int hw = img_h * img_w; + for(int h = 0; h < img_h; h++) + { + for(int w = 0; w < img_w; w++) + { + for(int c = 0; c < 3; c++) + { + input_data[c * hw + h * img_w + w] =(float)(*src_ptr - mean[c]); + src_ptr++; + } + } + } +} + +int main(int argc, char* argv[]) +{ + if(argc < 4) + { + std::cout << "[Usage]: " << argv[0] << " \n"; + return 0; + } + std::string proto_name_ = argv[1]; + std::string mdl_name_ = argv[2]; + std::string image_file = argv[3]; + + std::string save_file = "save.jpg"; + + cv::Mat img = cv::imread(image_file); + if(img.empty()) + { + std::cerr << "failed to read image file " << image_file << "\n"; + return -1; + } +#if 1 + // resize to 320 x 240 + cv::Mat resize_img; + int img_w = 320; + int img_h = 240; + cv::resize(img, resize_img, cv::Size(img_w, img_h), 0, 0,cv::INTER_NEAREST); + float* input_data = ( float* )malloc(sizeof(float) * img_h * img_w * 3); + get_input_data(resize_img, input_data, img_h, img_w); +#else + // use origin image size + int img_h = img.rows; + int img_w = img.cols; + float* input_data = ( float* )malloc(sizeof(float) * img_h * img_w * 3); + get_input_data(img, input_data, img_h, img_w); +#endif + + init_tengine(); + if(request_tengine_version("0.9") < 0) + return 1; + + graph_t graph = create_graph(nullptr, "caffe", proto_name_.c_str(), mdl_name_.c_str()); + if(graph == nullptr) + { + std::cout << "Create graph0 failed\n"; + std::cout << "errno: " << get_tengine_errno() << "\n"; + return -1; + } + + /* get input tensor */ + int node_idx = 0; + int tensor_idx = 0; + tensor_t input_tensor = get_graph_input_tensor(graph, 
node_idx, tensor_idx); + int dims[] = {1, 3, img_h, img_w}; + set_tensor_shape(input_tensor, dims, 4); + /* setup input buffer */ + if(set_tensor_buffer(input_tensor, input_data, 3 * img_h * img_w * 4) < 0) + { + std::printf("Set buffer for tensor failed\n"); + return -1; + } + + prerun_graph(graph); + + // time run_graph + int repeat_count = 1; + const char* repeat = std::getenv("REPEAT_COUNT"); + if(repeat) + repeat_count = std::strtoul(repeat, NULL, 10); + + struct timeval t0, t1; + float avg_time = 0.f; + gettimeofday(&t0, NULL); + for(int i = 0; i < repeat_count; i++) + run_graph(graph, 1); + gettimeofday(&t1, NULL); + float mytime = ( float )((t1.tv_sec * 1000000 + t1.tv_usec) - (t0.tv_sec * 1000000 + t0.tv_usec)) / 1000; + avg_time += mytime; + std::cout << "--------------------------------------\n"; + std::cout << "repeat " << repeat_count << " times, avg time per run is " << avg_time / repeat_count << " ms\n"; + + // post process + tensor_t out_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out"); + int out_dim[4]; + get_tensor_shape(out_tensor, out_dim, 4); + float* outdata = ( float* )get_tensor_buffer(out_tensor); + int num = out_dim[1]; + + post_process_ssd(img, show_threshold, outdata, num, save_file.c_str()); + + // free + release_graph_tensor(out_tensor); + release_graph_tensor(input_tensor); + postrun_graph(graph); + destroy_graph(graph); + free(input_data); + release_tengine(); + + return 0; +} diff --git a/examples/caffe_wrapper/cpp_classification/CMakeLists.txt b/examples/caffe_wrapper/cpp_classification/CMakeLists.txt index dccedf463..5849f617f 100644 --- a/examples/caffe_wrapper/cpp_classification/CMakeLists.txt +++ b/examples/caffe_wrapper/cpp_classification/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(classification) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS wrapper tengine protobuf boost_thread glog ) +set( TENGINE_LIBS wrapper tengine hclcpu protobuf boost_thread glog ) set( 
CODE_SRC classification.cpp ) #flag @@ -23,9 +23,14 @@ include_directories(${INSTALL_DIR}/wrapper_include link_directories(${INSTALL_DIR}/lib) #exe + +if ( NOT ARM) +set (OPEN_BLAS_LIB openblas) +endif() + add_executable(classification ${CODE_SRC}) -target_link_libraries(classification ${TENGINE_LIBS} ${OpenCV_LIBS}) +target_link_libraries(classification ${TENGINE_LIBS} ${OpenCV_LIBS} ${OPEN_BLAS_LIB}) add_executable(classification_mobilenet ${CODE_SRC}) target_compile_definitions(classification_mobilenet PUBLIC -DMOBILE_NET) -target_link_libraries(classification_mobilenet ${TENGINE_LIBS} ${OpenCV_LIBS}) +target_link_libraries(classification_mobilenet ${TENGINE_LIBS} ${OpenCV_LIBS} ${OPEN_BLAS_LIB}) diff --git a/examples/caffe_wrapper/mtcnn/CMakeLists.txt b/examples/caffe_wrapper/mtcnn/CMakeLists.txt index 15ca8b939..57b4aee09 100644 --- a/examples/caffe_wrapper/mtcnn/CMakeLists.txt +++ b/examples/caffe_wrapper/mtcnn/CMakeLists.txt @@ -4,7 +4,7 @@ project(CAFFE_MTCNN) set( TENGINE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../ ) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS wrapper tengine ) +set( TENGINE_LIBS wrapper tengine hclcpu) set( CAFFE_MTCNN_SRCS test_caffe_mtcnn.cpp caffe_mtcnn.cpp caffe_mtcnn_utils.cpp ) #flag @@ -21,5 +21,9 @@ include_directories(${INSTALL_DIR}/wrapper_include link_directories(${INSTALL_DIR}/lib) #exe +if ( NOT (ARM OR ANDROID) ) +set (OPEN_BLAS_LIB openblas) +endif() + add_executable(CAFFE_MTCNN ${CAFFE_MTCNN_SRCS}) -target_link_libraries(CAFFE_MTCNN ${TENGINE_LIBS} ${OpenCV_LIBS} boost_thread) +target_link_libraries(CAFFE_MTCNN ${TENGINE_LIBS} ${OpenCV_LIBS} boost_thread ${OPEN_BLAS_LIB}) diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 666c134cc..f3fd3727f 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -101,24 +101,24 @@ bool set_tengine_config() */ std::string get_file(const char* fname) { - std::fstream test_fs; + FILE* fp; std::string fn = fname; const std::string 
mod_sch1 = "./" + fn; const std::string mod_sch2 = get_root_path() + "models/" + fn; - test_fs.open(mod_sch1.c_str()); - if(test_fs.is_open()) + fp = fopen(mod_sch1.c_str(), "r"); + if(fp) { - test_fs.close(); + fclose(fp); return mod_sch1; } else { - test_fs.open(mod_sch2.c_str()); - if(test_fs.is_open()) + fp = fopen(mod_sch2.c_str(), "r"); + if(fp) { - test_fs.close(); + fclose(fp); return mod_sch2; } else diff --git a/examples/faster_rcnn/CMakeLists.txt b/examples/faster_rcnn/CMakeLists.txt index 2d4da5631..1e8a54d8a 100644 --- a/examples/faster_rcnn/CMakeLists.txt +++ b/examples/faster_rcnn/CMakeLists.txt @@ -4,7 +4,8 @@ project(FASTER_RCNN) link_directories( ${PROTOBUF_DIR}/lib ) add_definitions(-std=c++11) set( INSTALL_DIR ${TENGINE_DIR}/install ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) + if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) endif() @@ -23,6 +24,11 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() + + #opencv find_package(OpenCV REQUIRED) @@ -37,7 +43,7 @@ add_executable(FASTER_RCNN ${RESNET_SRCS}) if( ANDROID) target_link_libraries(FASTER_RCNN ${TENGINE_LIBS} ${PROTOBUF_LIB} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(FASTER_RCNN ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(FASTER_RCNN ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/imagenet_classification/CMakeLists.txt b/examples/imagenet_classification/CMakeLists.txt index 54f3b0222..7ce54f998 100644 --- a/examples/imagenet_classification/CMakeLists.txt +++ b/examples/imagenet_classification/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(Classify) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB 
${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) endif() @@ -20,6 +20,9 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() set( CODE_SRCS classification.cpp model_config.cpp ../common/common.cpp ) set( BIN_EXE Classify ) @@ -38,5 +41,5 @@ add_executable(${BIN_EXE} ${CODE_SRCS}) if(ANDROID) target_link_libraries(${BIN_EXE} ${PROTOBUF_LIB} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/lighten_cnn/CMakeLists.txt b/examples/lighten_cnn/CMakeLists.txt index ce2a9d805..b04a1fe9b 100644 --- a/examples/lighten_cnn/CMakeLists.txt +++ b/examples/lighten_cnn/CMakeLists.txt @@ -3,7 +3,8 @@ cmake_minimum_required (VERSION 2.8) project(LIGHTEN_CNN) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) + if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) endif() @@ -21,6 +22,11 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() + + #opencv find_package(OpenCV REQUIRED) @@ -35,6 +41,6 @@ add_executable(LIGHTEN_CNN ${CODE_SRCS}) if( ANDROID) target_link_libraries(LIGHTEN_CNN ${TENGINE_LIBS} ${PROTOBUF_LIB} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(LIGHTEN_CNN ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(LIGHTEN_CNN ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/linux_build.sh b/examples/linux_build.sh index 725c3079e..e6f35dc51 100755 --- a/examples/linux_build.sh +++ b/examples/linux_build.sh @@ -1,4 +1,4 @@ #!/bin/bash -cmake -DTENGINE_DIR=/home/usr/tengine \ +cmake -DTENGINE_DIR=/home/haitao/workshop/tengine \ .. 
diff --git a/examples/mobilenet_ssd/CMakeLists.txt b/examples/mobilenet_ssd/CMakeLists.txt index 426646c56..3ba1e6806 100644 --- a/examples/mobilenet_ssd/CMakeLists.txt +++ b/examples/mobilenet_ssd/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(MSSD) add_definitions(-std=c++11) set( INSTALL_DIR ${TENGINE_DIR}/install/) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) @@ -21,6 +21,9 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() set( CODE_SRCS mssd.cpp ../common/common.cpp) @@ -39,7 +42,7 @@ add_executable(MSSD ${CODE_SRCS}) if( ANDROID) target_link_libraries(MSSD ${TENGINE_LIBS} ${PROTOBUF_LIB} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(MSSD ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(MSSD ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/mtcnn/CMakeLists.txt b/examples/mtcnn/CMakeLists.txt index 205f12e6d..6f6d8f49a 100644 --- a/examples/mtcnn/CMakeLists.txt +++ b/examples/mtcnn/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(MTCNN) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) endif() @@ -20,6 +20,10 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() + set( MTCNN_SRCS mtcnn_utils.cpp mtcnn.cpp test_mtcnn.cpp ../common/common.cpp ) #opencv @@ -37,6 +41,6 @@ add_executable(MTCNN ${MTCNN_SRCS}) if( ANDROID) target_link_libraries(MTCNN ${TENGINE_LIBS} ${PROTOBUF_LIB} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(MTCNN ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(MTCNN ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff 
--git a/examples/ssd/CMakeLists.txt b/examples/ssd/CMakeLists.txt index 94cf49cc3..2cc458ca3 100644 --- a/examples/ssd/CMakeLists.txt +++ b/examples/ssd/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(SSD) set( INSTALL_DIR ${TENGINE_DIR}/install/) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) endif() @@ -21,6 +21,9 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() #opencv find_package(OpenCV REQUIRED) @@ -37,7 +40,7 @@ add_executable(SSD ${CODE_SRCS}) if( ANDROID) target_link_libraries(SSD ${TENGINE_LIBS} ${PROTOBUF_LIB} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(SSD ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(SSD ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/tengine_model/classification/CMakeLists.txt b/examples/tengine_model/classification/CMakeLists.txt index fe6754b18..02fdd4c56 100644 --- a/examples/tengine_model/classification/CMakeLists.txt +++ b/examples/tengine_model/classification/CMakeLists.txt @@ -3,7 +3,8 @@ cmake_minimum_required (VERSION 2.8) project(tm_classify) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) + if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) endif() @@ -20,10 +21,25 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() set( CODE_SRCS classification.cpp ../../common/common.cpp ) set( BIN_EXE tm_classify ) +set( CODE_SRCS_TF classification_tf.cpp ../../common/common.cpp ) +set( BIN_EXE_TF tm_classify_tf ) + +set( CODE_SRCS_INT8 classification_int8.cpp ../../common/common.cpp ) +set( BIN_EXE_INT8 tm_classify_int8 ) + +if(NOT ANDROID AND CMAKE_SIZEOF_VOID_P EQUAL 4) + 
add_definitions(-mfp16-format=ieee -mfpu=neon-fp16) +endif() +set( CODE_SRCS_FP16 classification_fp16.cpp ../../common/common.cpp ) +set( BIN_EXE_FP16 tm_classify_fp16 ) + #flag #set(CMAKE_CXX_FLAGS "-std=c++11 -O3 -Wall") @@ -31,7 +47,8 @@ find_package(OpenCV REQUIRED) #include include_directories(${INSTALL_DIR}/include - ${TENGINE_DIR}/examples/common) + ${TENGINE_DIR}/examples/common + ${TENGINE_DIR}/core/include) #lib link_directories(${INSTALL_DIR}/lib) @@ -41,5 +58,27 @@ add_executable(${BIN_EXE} ${CODE_SRCS}) if(ANDROID) target_link_libraries(${BIN_EXE} ${PROTOBUF_LIB} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) +endif() + +add_executable(${BIN_EXE_TF} ${CODE_SRCS_TF}) +if(ANDROID) + target_link_libraries(${BIN_EXE_TF} ${PROTOBUF_LIB} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) +else() + target_link_libraries(${BIN_EXE_TF} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() + +add_executable(${BIN_EXE_INT8} ${CODE_SRCS_INT8}) +if(ANDROID) + target_link_libraries(${BIN_EXE_INT8} ${PROTOBUF_LIB} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) +else() + target_link_libraries(${BIN_EXE_INT8} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) +endif() + +add_executable(${BIN_EXE_FP16} ${CODE_SRCS_FP16}) +if(ANDROID) + target_link_libraries(${BIN_EXE_FP16} ${PROTOBUF_LIB} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) +else() + target_link_libraries(${BIN_EXE_FP16} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) +endif() + diff --git a/examples/tengine_model/classification/classification_fp16.cpp b/examples/tengine_model/classification/classification_fp16.cpp new file mode 100644 index 000000000..bc0cd2555 --- /dev/null +++ b/examples/tengine_model/classification/classification_fp16.cpp @@ -0,0 +1,444 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: jingyou@openailab.com + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" +#include "tengine_c_api.h" +#include "common.hpp" +#include "cpu_device.h" +#include "compiler_fp16.h" + +#define DEFAULT_MODEL_NAME "squeezenet" +#define DEFAULT_IMAGE_FILE "tests/images/cat.jpg" +#define DEFAULT_LABEL_FILE "models/synset_words.txt" +#define DEFAULT_IMG_H 227 +#define DEFAULT_IMG_W 227 +#define DEFAULT_SCALE 1.f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_REPEAT_CNT 1 +#define PRINT_TOP_NUM 5 + +typedef struct +{ + const char* model_name; + int img_h; + int img_w; + float scale; + float mean[3]; + const char* tm_file; + const char* label_file; +} Model_Config; + +const Model_Config model_list[] = { + {"squeezenet", 227, 227, 1.f, {104.007, 116.669, 122.679}, "squeezenet_fp16.tmfile", "synset_words.txt"}, + {"mobilenet", 224, 224, 0.017, {104.007, 116.669, 122.679}, "mobilenet_fp16.tmfile", "synset_words.txt"}, + {"mobilenet_v2", 224, 224, 0.017, {104.007, 116.669, 122.679}, "mobilenet_v2_fp16.tmfile", "synset_words.txt"}, + {"resnet50", 224, 
224, 1.f, {104.007, 116.669, 122.679}, "resnet50_fp16.tmfile", "synset_words.txt"}, + {"alexnet", 227, 227, 1.f, {104.007, 116.669, 122.679}, "alexnet_fp16.tmfile", "synset_words.txt"}, + {"googlenet", 224, 224, 1.f, {104.007, 116.669, 122.679}, "googlenet_fp16.tmfile", "synset_words.txt"}, + {"inception_v3", 395, 395, 0.0078, {104.007, 116.669, 122.679}, "inception_v3_fp16.tmfile", "synset2015.txt"}, + {"inception_v4", 299, 299, 1 / 127.5f, {104.007, 116.669, 122.679}, "inception_v4_fp16.tmfile", "synset_words.txt"}, + {"vgg16", 224, 224, 1.f, {104.007, 116.669, 122.679}, "vgg16_fp16.tmfile", "synset_words.txt"} +}; + +const Model_Config* get_model_config(const char* model_name) +{ + std::string name1 = model_name; + for(unsigned int i = 0; i < name1.size(); i++) + name1[i] = tolower(name1[i]); + + for(unsigned int i = 0; i < sizeof(model_list) / sizeof(Model_Config); i++) + { + std::string name2 = model_list[i].model_name; + if(name1 == name2) + { + return &model_list[i]; + } + } + std::cerr << "Not support model name : " << model_name << "\n"; + return nullptr; +} + +void LoadLabelFile(std::vector& result, const char* fname) +{ + std::ifstream labels(fname); + + std::string line; + while(std::getline(labels, line)) + result.push_back(line); +} + +static inline bool PairCompare(const std::pair<__fp16, int>& lhs, const std::pair<__fp16, int>& rhs) +{ + return fp16_to_fp32(lhs.first) > fp16_to_fp32(rhs.first); +} + +static inline std::vector Argmax(const std::vector<__fp16>& v, int N) +{ + std::vector > pairs; + for(size_t i = 0; i < v.size(); ++i) + pairs.push_back(std::make_pair(v[i], i)); + std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare); + + std::vector result; + for(int i = 0; i < N; ++i) + result.push_back(pairs[i].second); + return result; +} + +void get_input_data(const char* image_file, __fp16* input_data, int img_h, int img_w, const float* mean, float scale) +{ + cv::Mat sample = cv::imread(image_file, -1); + 
if(sample.empty()) + { + std::cerr << "Failed to read image file " << image_file << ".\n"; + return; + } + cv::Mat img; + if(sample.channels() == 4) + { + cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR); + } + else if(sample.channels() == 1) + { + cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR); + } + else + { + img = sample; + } + + cv::resize(img, img, cv::Size(img_h, img_w)); + img.convertTo(img, CV_32FC3); + float* img_data = ( float* )img.data; + int hw = img_h * img_w; + + for(int h = 0; h < img_h; h++) + { + for(int w = 0; w < img_w; w++) + { + for(int c = 0; c < 3; c++) + { + input_data[c * hw + h * img_w + w] = fp32_to_fp16((*img_data - mean[c]) * scale); + img_data++; + } + } + } +} + +void PrintTopLabels(const char* label_file, __fp16* data) +{ + // load labels + std::vector labels; + LoadLabelFile(labels, label_file); + + __fp16* end = data + 1000; + std::vector<__fp16> result(data, end); + std::vector top_N = Argmax(result, PRINT_TOP_NUM); + + for(unsigned int i = 0; i < top_N.size(); i++) + { + int idx = top_N[i]; + + std::cout << std::fixed << std::setprecision(4) << fp16_to_fp32(result[idx]) << " - \"" << labels[idx] << "\"\n"; + } +} + +bool run_tengine_library(const char* model_name, const char* tm_file, const char* label_file, const char* image_file, + int img_h, int img_w, const float* mean, float scale, int repeat_count) +{ + // init + init_tengine(); + if(request_tengine_version("1.2") < 0) + return false; + + // create graph + graph_t graph = create_graph(nullptr, "tengine", tm_file); + if(graph == nullptr) + { + std::cerr << "Create graph failed.\n"; + std::cerr << "errno: " << get_tengine_errno() << "\n"; + return false; + } + + // set input shape + int img_size = img_h * img_w * 3; + int dims[] = {1, 3, img_h, img_w}; + __fp16* input_data = ( __fp16* )malloc(img_size * sizeof(__fp16)); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + if(input_tensor == nullptr) + { + std::cerr << "Get input tensor failed\n"; + return false; + 
} + set_tensor_shape(input_tensor, dims, 4); + + // prerun + if(prerun_graph(graph) < 0) + { + std::cerr << "Prerun graph failed\n"; + return false; + } + //dump_graph(graph); + + struct timeval t0, t1; + float avg_time = 0.f; + for(int i = 0; i < repeat_count; i++) + { + get_input_data(image_file, input_data, img_h, img_w, mean, scale); + set_tensor_buffer(input_tensor, input_data, img_size * 4); + + gettimeofday(&t0, NULL); + if(run_graph(graph, 1) < 0) + { + std::cerr << "Run graph failed\n"; + return false; + } + gettimeofday(&t1, NULL); + + float mytime = ( float )((t1.tv_sec * 1000000 + t1.tv_usec) - (t0.tv_sec * 1000000 + t0.tv_usec)) / 1000; + avg_time += mytime; + } + std::cout << "\nModel name : " << model_name << "\n" + << "tengine model file : " << tm_file << "\n" + << "label file : " << label_file << "\n" + << "image file : " << image_file << "\n" + << "img_h, imag_w, scale, mean[3] : " << img_h << " " << img_w << " " << scale << " " << mean[0] << " " + << mean[1] << " " << mean[2] << "\n"; + std::cout << "\nRepeat " << repeat_count << " times, avg time per run is " << avg_time / repeat_count << " ms\n"; + std::cout << "--------------------------------------\n"; + + // print output + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); + __fp16* data = ( __fp16* )get_tensor_buffer(output_tensor); + PrintTopLabels(label_file, data); + std::cout << "--------------------------------------\n"; + + //tensor_t tensor1 = get_graph_tensor(graph, "pool1"); + //Dumpdata("sqz_pool1_fp16.txt", (__fp16*)get_tensor_buffer(tensor1), get_tensor_buffer_size(tensor1)/2); + + free(input_data); + release_graph_tensor(input_tensor); + release_graph_tensor(output_tensor); + postrun_graph(graph); + destroy_graph(graph); + + return true; +} + +template static std::vector ParseString(const std::string str) +{ + typedef std::string::size_type pos; + const char delim_ch = ','; + std::string str_tmp = str; + std::vector result; + T t; + + pos delim_pos = 
str_tmp.find(delim_ch); + while(delim_pos != std::string::npos) + { + std::istringstream ist(str_tmp.substr(0, delim_pos)); + ist >> t; + result.push_back(t); + str_tmp.replace(0, delim_pos + 1, ""); + delim_pos = str_tmp.find(delim_ch); + } + if(str_tmp.size() > 0) + { + std::istringstream ist(str_tmp); + ist >> t; + result.push_back(t); + } + + return result; +} + +int main(int argc, char* argv[]) +{ + int repeat_count = DEFAULT_REPEAT_CNT; + const std::string root_path = get_root_path(); + std::string model_name; + std::string tm_file; + std::string label_file; + std::string image_file; + std::vector hw; + std::vector ms; + int img_h = 0; + int img_w = 0; + float scale = 0.0; + float mean[3] = {-1.0, -1.0, -1.0}; + + int res; + while((res = getopt(argc, argv, "n:t:l:i:g:s:w:r:h")) != -1) + { + switch(res) + { + case 'n': + model_name = optarg; + break; + case 't': + tm_file = optarg; + break; + case 'l': + label_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + hw = ParseString(optarg); + if(hw.size() != 2) + { + std::cerr << "Error -g parameter.\n"; + return -1; + } + img_h = hw[0]; + img_w = hw[1]; + break; + case 's': + scale = strtof(optarg, NULL); + break; + case 'w': + ms = ParseString(optarg); + if(ms.size() != 3) + { + std::cerr << "Error -w parameter.\n"; + return -1; + } + mean[0] = ms[0]; + mean[1] = ms[1]; + mean[2] = ms[2]; + break; + case 'r': + repeat_count = std::strtoul(optarg, NULL, 10); + break; + case 'h': + std::cout << "[Usage]: " << argv[0] << " [-h]\n" + << " [-n model_name] [-t tm_file] [-l label_file] [-i image_file]\n" + << " [-g img_h,img_w] [-s scale] [-w mean[0],mean[1],mean[2]] [-r repeat_count]\n"; + return 0; + default: + break; + } + } + + const Model_Config* mod_config; + // if model files not specified + if(tm_file.empty()) + { + // if model name not specified + if(model_name.empty()) + { + // use default model + model_name = DEFAULT_MODEL_NAME; + std::cout << "Model name and tm file not 
specified, run " << model_name << " by default.\n"; + } + // get model config in predefined model list + mod_config = get_model_config(model_name.c_str()); + if(mod_config == nullptr) + return -1; + + // get tm file + tm_file = get_file(mod_config->tm_file); + if(tm_file.empty()) + return -1; + + // if label file not specified + if(label_file.empty()) + { + // get label file + label_file = get_file(mod_config->label_file); + if(label_file.empty()) + return -1; + } + + if(!hw.size()) + { + img_h = mod_config->img_h; + img_w = mod_config->img_w; + } + if(scale == 0.0) + scale = mod_config->scale; + if(!ms.size()) + { + mean[0] = mod_config->mean[0]; + mean[1] = mod_config->mean[1]; + mean[2] = mod_config->mean[2]; + } + } + + // if label file not specified, use default label file + if(label_file.empty()) + { + label_file = root_path + DEFAULT_LABEL_FILE; + std::cout << "Label file not specified, use " << label_file << " by default.\n"; + } + + // if image file not specified, use default image file + if(image_file.empty()) + { + image_file = root_path + DEFAULT_IMAGE_FILE; + std::cout << "Image file not specified, use " << image_file << " by default.\n"; + } + + if(img_h == 0) + img_h = DEFAULT_IMG_H; + if(img_w == 0) + img_w = DEFAULT_IMG_W; + if(scale == 0.0) + scale = DEFAULT_SCALE; + if(mean[0] == -1.0) + mean[0] = DEFAULT_MEAN1; + if(mean[1] == -1.0) + mean[1] = DEFAULT_MEAN2; + if(mean[2] == -1.0) + mean[2] = DEFAULT_MEAN3; + if(model_name.empty()) + model_name = "unknown"; + + // check input files + if(!check_file_exist(tm_file) || !check_file_exist(label_file) || !check_file_exist(image_file)) + return -1; + + // start to run + if(!run_tengine_library(model_name.c_str(), tm_file.c_str(), label_file.c_str(), image_file.c_str(), img_h, img_w, + mean, scale, repeat_count)) + return -1; + + std::cout << "ALL TEST DONE\n"; + + return 0; +} diff --git a/examples/tengine_model/classification/classification_int8.cpp 
b/examples/tengine_model/classification/classification_int8.cpp new file mode 100644 index 000000000..1a2a2087d --- /dev/null +++ b/examples/tengine_model/classification/classification_int8.cpp @@ -0,0 +1,474 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: jingyou@openailab.com + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" +#include "tengine_c_api.h" +#include "common.hpp" +#include "cpu_device.h" + +#define DEFAULT_MODEL_NAME "squeezenet" +#define DEFAULT_IMAGE_FILE "tests/images/cat.jpg" +#define DEFAULT_LABEL_FILE "models/synset_words.txt" +#define DEFAULT_IMG_H 227 +#define DEFAULT_IMG_W 227 +#define DEFAULT_SCALE 1.f +#define DEFAULT_MEAN1 104.007 +#define DEFAULT_MEAN2 116.669 +#define DEFAULT_MEAN3 122.679 +#define DEFAULT_REPEAT_CNT 1 +#define PRINT_TOP_NUM 5 + +typedef struct +{ + const char* model_name; + int img_h; + int img_w; + float scale; + float mean[3]; + const char* tm_file; + const char* label_file; +} Model_Config; + +const Model_Config model_list[] = { + {"squeezenet", 227, 227, 1.f, {104.007, 116.669, 122.679}, "squeezenet_int8.tmfile", "synset_words.txt"}, + {"mobilenet", 224, 224, 0.017, {104.007, 116.669, 122.679}, "mobilenet_int8.tmfile", "synset_words.txt"}, + {"mobilenet_v2", 224, 224, 0.017, {104.007, 116.669, 122.679}, "mobilenet_v2_int8.tmfile", "synset_words.txt"}, + {"resnet50", 224, 224, 1.f, {104.007, 116.669, 122.679}, "resnet50_int8.tmfile", "synset_words.txt"}, + {"alexnet", 227, 227, 1.f, {104.007, 116.669, 122.679}, "alexnet_int8.tmfile", "synset_words.txt"}, + {"googlenet", 224, 224, 1.f, {104.007, 116.669, 122.679}, "googlenet_int8.tmfile", "synset_words.txt"}, + {"inception_v3", 395, 395, 0.0078, {104.007, 116.669, 122.679}, "inception_v3_int8.tmfile", "synset2015.txt"}, + {"inception_v4", 299, 299, 1 / 127.5f, {104.007, 116.669, 122.679}, "inception_v4_int8.tmfile", "synset_words.txt"}, + {"vgg16", 224, 224, 1.f, {104.007, 116.669, 122.679}, "vgg16_int8.tmfile", "synset_words.txt"} +}; + + +const Model_Config* get_model_config(const char* model_name) +{ + std::string name1 = model_name; 
+ for(unsigned int i = 0; i < name1.size(); i++) + name1[i] = tolower(name1[i]); + + for(unsigned int i = 0; i < sizeof(model_list) / sizeof(Model_Config); i++) + { + std::string name2 = model_list[i].model_name; + if(name1 == name2) + { + return &model_list[i]; + } + } + std::cerr << "Not support model name : " << model_name << "\n"; + return nullptr; +} + +void LoadLabelFile(std::vector& result, const char* fname) +{ + std::ifstream labels(fname); + + std::string line; + while(std::getline(labels, line)) + result.push_back(line); +} + +static inline bool PairCompare(const std::pair& lhs, const std::pair& rhs) +{ + return lhs.first > rhs.first; +} + +static inline std::vector Argmax(const std::vector& v, int N) +{ + std::vector> pairs; + for(size_t i = 0; i < v.size(); ++i) + pairs.push_back(std::make_pair(v[i], i)); + std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare); + + std::vector result; + for(int i = 0; i < N; ++i) + result.push_back(pairs[i].second); + return result; +} + +static float get_absmax_val(float* data, int data_size) +{ + float max_val = 0.f; + if(data != nullptr) + { + for(int i = 0; i < data_size; i++) + { + float abs_val = fabs(data[i]); + if(abs_val > max_val) + max_val = abs_val; + } + } + return max_val; +} + +void get_input_data(const char* image_file, int8_t* input_data, int img_h, int img_w, const float* mean, float scale, + float *input_scale, int *zero_point) +{ + cv::Mat sample = cv::imread(image_file, -1); + if(sample.empty()) + { + std::cerr << "Failed to read image file " << image_file << ".\n"; + return; + } + cv::Mat img; + if(sample.channels() == 4) + { + cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR); + } + else if(sample.channels() == 1) + { + cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR); + } + else + { + img = sample; + } + + cv::resize(img, img, cv::Size(img_h, img_w)); + img.convertTo(img, CV_32FC3); + float* img_data = ( float* )img.data; + int hw = img_h * img_w; + + float* temp_data = 
(float*)malloc(hw*3*sizeof(float)); + for(int h = 0; h < img_h; h++) + { + for(int w = 0; w < img_w; w++) + { + for(int c = 0; c < 3; c++) + { + temp_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale; + img_data++; + } + } + } + + float input_max = get_absmax_val(temp_data, hw*3); + *input_scale = input_max / 127; + *zero_point = 0; + + for(int i = 0; i < hw*3; i++) + input_data[i] = (int8_t)(round(temp_data[i] / *input_scale) + *zero_point); + + free(temp_data); +} + +void PrintTopLabels(const char* label_file, int8_t* data, float q_scale) +{ + // load labels + std::vector labels; + LoadLabelFile(labels, label_file); + + int8_t* end = data + 1000; + std::vector result(data, end); + std::vector top_N = Argmax(result, PRINT_TOP_NUM); + + for(unsigned int i = 0; i < top_N.size(); i++) + { + int idx = top_N[i]; + + float val = result[idx] * q_scale; + std::cout << std::fixed << std::setprecision(4) << val << " - \"" << labels[idx] << "\"\n"; + } +} + +bool run_tengine_library(const char* model_name, const char* tm_file, const char* label_file, const char* image_file, + int img_h, int img_w, const float* mean, float scale, int repeat_count) +{ + // init + init_tengine(); + if(request_tengine_version("1.2") < 0) + return false; + + // create graph + graph_t graph = create_graph(nullptr, "tengine", tm_file); + if(graph == nullptr) + { + std::cerr << "Create graph failed.\n"; + std::cerr << "errno: " << get_tengine_errno() << "\n"; + return false; + } + + // set input shape + int img_size = img_h * img_w * 3; + int dims[] = {1, 3, img_h, img_w}; + int8_t* input_data = ( int8_t* )malloc(img_size); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + if(input_tensor == nullptr) + { + std::cerr << "Get input tensor failed\n"; + return false; + } + set_tensor_shape(input_tensor, dims, 4); + + // prerun + if(prerun_graph(graph) < 0) + { + std::cerr << "Prerun graph failed\n"; + return false; + } + //dump_graph(graph); + + struct timeval t0, t1; + float 
avg_time = 0.f; + for(int i = 0; i < repeat_count; i++) + { + float input_scale; + int zero_point; + get_input_data(image_file, input_data, img_h, img_w, mean, scale, &input_scale, &zero_point); + set_tensor_buffer(input_tensor, input_data, img_size * 4); + set_tensor_quant_param(input_tensor, &input_scale, &zero_point, 1); + + gettimeofday(&t0, NULL); + if(run_graph(graph, 1) < 0) + { + std::cerr << "Run graph failed\n"; + return false; + } + gettimeofday(&t1, NULL); + + float mytime = ( float )((t1.tv_sec * 1000000 + t1.tv_usec) - (t0.tv_sec * 1000000 + t0.tv_usec)) / 1000; + avg_time += mytime; + } + std::cout << "\nModel name : " << model_name << "\n" + << "tengine model file : " << tm_file << "\n" + << "label file : " << label_file << "\n" + << "image file : " << image_file << "\n" + << "img_h, imag_w, scale, mean[3] : " << img_h << " " << img_w << " " << scale << " " << mean[0] << " " + << mean[1] << " " << mean[2] << "\n"; + std::cout << "\nRepeat " << repeat_count << " times, avg time per run is " << avg_time / repeat_count << " ms\n"; + std::cout << "--------------------------------------\n"; + + // print output + float q_scale; + int q_zero; + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); + get_tensor_quant_param(output_tensor, &q_scale, &q_zero, 1); + int8_t* data = ( int8_t* )get_tensor_buffer(output_tensor); + PrintTopLabels(label_file, data, q_scale); + std::cout << "--------------------------------------\n"; + + free(input_data); + release_graph_tensor(input_tensor); + release_graph_tensor(output_tensor); + postrun_graph(graph); + destroy_graph(graph); + + return true; +} + +template static std::vector ParseString(const std::string str) +{ + typedef std::string::size_type pos; + const char delim_ch = ','; + std::string str_tmp = str; + std::vector result; + T t; + + pos delim_pos = str_tmp.find(delim_ch); + while(delim_pos != std::string::npos) + { + std::istringstream ist(str_tmp.substr(0, delim_pos)); + ist >> t; + 
result.push_back(t); + str_tmp.replace(0, delim_pos + 1, ""); + delim_pos = str_tmp.find(delim_ch); + } + if(str_tmp.size() > 0) + { + std::istringstream ist(str_tmp); + ist >> t; + result.push_back(t); + } + + return result; +} + +int main(int argc, char* argv[]) +{ + int repeat_count = DEFAULT_REPEAT_CNT; + const std::string root_path = get_root_path(); + std::string model_name; + std::string tm_file; + std::string label_file; + std::string image_file; + std::vector hw; + std::vector ms; + int img_h = 0; + int img_w = 0; + float scale = 0.0; + float mean[3] = {-1.0, -1.0, -1.0}; + + int res; + while((res = getopt(argc, argv, "n:t:l:i:g:s:w:r:h")) != -1) + { + switch(res) + { + case 'n': + model_name = optarg; + break; + case 't': + tm_file = optarg; + break; + case 'l': + label_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + hw = ParseString(optarg); + if(hw.size() != 2) + { + std::cerr << "Error -g parameter.\n"; + return -1; + } + img_h = hw[0]; + img_w = hw[1]; + break; + case 's': + scale = strtof(optarg, NULL); + break; + case 'w': + ms = ParseString(optarg); + if(ms.size() != 3) + { + std::cerr << "Error -w parameter.\n"; + return -1; + } + mean[0] = ms[0]; + mean[1] = ms[1]; + mean[2] = ms[2]; + break; + case 'r': + repeat_count = std::strtoul(optarg, NULL, 10); + break; + case 'h': + std::cout << "[Usage]: " << argv[0] << " [-h]\n" + << " [-n model_name] [-t tm_file] [-l label_file] [-i image_file]\n" + << " [-g img_h,img_w] [-s scale] [-w mean[0],mean[1],mean[2]] [-r repeat_count]\n"; + return 0; + default: + break; + } + } + + const Model_Config* mod_config; + // if model files not specified + if(tm_file.empty()) + { + // if model name not specified + if(model_name.empty()) + { + // use default model + model_name = DEFAULT_MODEL_NAME; + std::cout << "Model name and tm file not specified, run " << model_name << " by default.\n"; + } + // get model config in predefined model list + mod_config = 
get_model_config(model_name.c_str()); + if(mod_config == nullptr) + return -1; + + // get tm file + tm_file = get_file(mod_config->tm_file); + if(tm_file.empty()) + return -1; + + // if label file not specified + if(label_file.empty()) + { + // get label file + label_file = get_file(mod_config->label_file); + if(label_file.empty()) + return -1; + } + + if(!hw.size()) + { + img_h = mod_config->img_h; + img_w = mod_config->img_w; + } + if(scale == 0.0) + scale = mod_config->scale; + if(!ms.size()) + { + mean[0] = mod_config->mean[0]; + mean[1] = mod_config->mean[1]; + mean[2] = mod_config->mean[2]; + } + } + + // if label file not specified, use default label file + if(label_file.empty()) + { + label_file = root_path + DEFAULT_LABEL_FILE; + std::cout << "Label file not specified, use " << label_file << " by default.\n"; + } + + // if image file not specified, use default image file + if(image_file.empty()) + { + image_file = root_path + DEFAULT_IMAGE_FILE; + std::cout << "Image file not specified, use " << image_file << " by default.\n"; + } + + if(img_h == 0) + img_h = DEFAULT_IMG_H; + if(img_w == 0) + img_w = DEFAULT_IMG_W; + if(scale == 0.0) + scale = DEFAULT_SCALE; + if(mean[0] == -1.0) + mean[0] = DEFAULT_MEAN1; + if(mean[1] == -1.0) + mean[1] = DEFAULT_MEAN2; + if(mean[2] == -1.0) + mean[2] = DEFAULT_MEAN3; + if(model_name.empty()) + model_name = "unknown"; + + // check input files + if(!check_file_exist(tm_file) || !check_file_exist(label_file) || !check_file_exist(image_file)) + return -1; + + // start to run + if(!run_tengine_library(model_name.c_str(), tm_file.c_str(), label_file.c_str(), image_file.c_str(), img_h, img_w, + mean, scale, repeat_count)) + return -1; + + std::cout << "ALL TEST DONE\n"; + + return 0; +} diff --git a/examples/tengine_model/classification/classification_tf.cpp b/examples/tengine_model/classification/classification_tf.cpp new file mode 100644 index 000000000..bd85e2b08 --- /dev/null +++ 
b/examples/tengine_model/classification/classification_tf.cpp @@ -0,0 +1,424 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: jingyou@openailab.com + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" +#include "tengine_c_api.h" +#include "common.hpp" +#include "cpu_device.h" + +#define DEFAULT_MODEL_NAME "inception_v3" +#define DEFAULT_IMAGE_FILE "tests/images/cat.jpg" +#define DEFAULT_LABEL_FILE "models/synset_words.txt" +#define DEFAULT_IMG_H 299 +#define DEFAULT_IMG_W 299 +#define DEFAULT_SCALE 1.f +#define DEFAULT_MEAN1 0 +#define DEFAULT_MEAN2 0 +#define DEFAULT_MEAN3 0 +#define DEFAULT_REPEAT_CNT 1 +#define PRINT_TOP_NUM 5 + +typedef struct +{ + const char* model_name; + int img_h; + int img_w; + float scale; + float mean[3]; + const char* tm_file; + const char* label_file; +} Model_Config; + +const Model_Config model_list[] = { + {"inception_v3", 299, 299, 0.0039, {0, 0, 0}, "inception_v3_tf.tmfile", "labels.txt"}, + {"inception_v4", 299, 299, 0.0039, {0, 0, 0}, "inception_v4_tf.tmfile", "labels.txt"}, + {"resnet_v2", 299, 299, 0.0039, 
{0, 0, 0}, "resnet_v2_tf.tmfile", "labels.txt"}, + {"mobilenet_v1", 224, 224, 0.017, {104.007, 116.669, 122.679}, "mobilenet_v1_tf.tmfile", "labels.txt"}, + {"mobilenet_v2", 224, 224, 0.0078, {128, 128, 128}, "mobilenet_v2_tf.tmfile", "imagenet_slim_labels.txt"}, + {"squeezenet", 224, 224, 0.0039, {0, 0, 0}, "squeezenet_tf.tmfile", "labels.txt"}, + {"resnet50", 224, 224, 1.f, {0, 0, 0}, "resnet50_tf.tmfile", "synset_words.txt"}}; + +const Model_Config* get_model_config(const char* model_name) +{ + std::string name1 = model_name; + for(unsigned int i = 0; i < name1.size(); i++) + name1[i] = tolower(name1[i]); + + for(unsigned int i = 0; i < sizeof(model_list) / sizeof(Model_Config); i++) + { + std::string name2 = model_list[i].model_name; + if(name1 == name2) + { + return &model_list[i]; + } + } + std::cerr << "Not support model name : " << model_name << "\n"; + return nullptr; +} + +void LoadLabelFile(std::vector& result, const char* fname) +{ + std::ifstream labels(fname); + + std::string line; + while(std::getline(labels, line)) + result.push_back(line); +} + +static inline bool PairCompare(const std::pair& lhs, const std::pair& rhs) +{ + return lhs.first > rhs.first; +} + +static inline std::vector Argmax(const std::vector& v, int N) +{ + std::vector> pairs; + for(size_t i = 0; i < v.size(); ++i) + pairs.push_back(std::make_pair(v[i], i)); + std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare); + + std::vector result; + for(int i = 0; i < N; ++i) + result.push_back(pairs[i].second); + return result; +} + +void get_input_data(const char* image_file, float* input_data, const int img_h, const int img_w, const float mean, + const float scale) +{ + cv::Mat img = cv::imread(image_file, -1); + if(img.empty()) + { + std::cerr << "Failed to read image file " << image_file << ".\n"; + return; + } + + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + cv::resize(img, img, cv::Size(img_w, img_h)); + img.convertTo(img, CV_32FC3); + + img = (img - mean) * 
scale; + + std::vector input_channels; + float* ptr = input_data; + + for(int i = 0; i < 3; ++i) + { + cv::Mat channel(img_h, img_w, CV_32FC1, ptr); + input_channels.push_back(channel); + ptr += img_h * img_w; + } + + cv::split(img, input_channels); +} + +void PrintTopLabels(const char* label_file, float* data) +{ + // load labels + std::vector labels; + LoadLabelFile(labels, label_file); + + float* end = data + 1000; + std::vector result(data, end); + std::vector top_N = Argmax(result, PRINT_TOP_NUM); + + for(unsigned int i = 0; i < top_N.size(); i++) + { + int idx = top_N[i]; + + std::cout << std::fixed << std::setprecision(4) << result[idx] << " - \"" << labels[idx] << "\"\n"; + } +} + +bool run_tengine_library(const char* model_name, const char* tm_file, const char* label_file, const char* image_file, + int img_h, int img_w, const float* mean, float scale, int repeat_count) +{ + // init + init_tengine(); + if(request_tengine_version("0.9") < 0) + return false; + + // create graph + graph_t graph = create_graph(nullptr, "tengine", tm_file); + if(graph == nullptr) + { + std::cerr << "Create graph failed.\n"; + std::cerr << "errno: " << get_tengine_errno() << "\n"; + return false; + } + + // set input shape + int img_size = img_h * img_w * 3; + int dims[] = {1, 3, img_h, img_w}; + float* input_data = ( float* )malloc(sizeof(float) * img_size); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + if(input_tensor == nullptr) + { + std::cerr << "Get input tensor failed\n"; + return false; + } + set_tensor_shape(input_tensor, dims, 4); + + // prerun + if(prerun_graph(graph) < 0) + { + std::cerr << "Prerun graph failed\n"; + return false; + } + + struct timeval t0, t1; + float avg_time = 0.f; + for(int i = 0; i < repeat_count; i++) + { + get_input_data(image_file, input_data, img_h, img_w, mean[0], scale); + set_tensor_buffer(input_tensor, input_data, img_size * 4); + + gettimeofday(&t0, NULL); + if(run_graph(graph, 1) < 0) + { + std::cerr << "Run graph 
failed\n"; + return false; + } + gettimeofday(&t1, NULL); + + float mytime = ( float )((t1.tv_sec * 1000000 + t1.tv_usec) - (t0.tv_sec * 1000000 + t0.tv_usec)) / 1000; + avg_time += mytime; + } + std::cout << "\nModel name : " << model_name << "\n" + << "tengine model file : " << tm_file << "\n" + << "label file : " << label_file << "\n" + << "image file : " << image_file << "\n" + << "img_h, imag_w, scale, mean[3] : " << img_h << " " << img_w << " " << scale << " " << mean[0] << " " + << mean[1] << " " << mean[2] << "\n"; + std::cout << "\nRepeat " << repeat_count << " times, avg time per run is " << avg_time / repeat_count << " ms\n"; + std::cout << "--------------------------------------\n"; + + // print output + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); + float* data = ( float* )get_tensor_buffer(output_tensor); + PrintTopLabels(label_file, data); + std::cout << "--------------------------------------\n"; + + free(input_data); + release_graph_tensor(input_tensor); + release_graph_tensor(output_tensor); + postrun_graph(graph); + destroy_graph(graph); + + return true; +} + +template static std::vector ParseString(const std::string str) +{ + typedef std::string::size_type pos; + const char delim_ch = ','; + std::string str_tmp = str; + std::vector result; + T t; + + pos delim_pos = str_tmp.find(delim_ch); + while(delim_pos != std::string::npos) + { + std::istringstream ist(str_tmp.substr(0, delim_pos)); + ist >> t; + result.push_back(t); + str_tmp.replace(0, delim_pos + 1, ""); + delim_pos = str_tmp.find(delim_ch); + } + if(str_tmp.size() > 0) + { + std::istringstream ist(str_tmp); + ist >> t; + result.push_back(t); + } + + return result; +} + +int main(int argc, char* argv[]) +{ + int repeat_count = DEFAULT_REPEAT_CNT; + const std::string root_path = get_root_path(); + std::string model_name; + std::string tm_file; + std::string label_file; + std::string image_file; + std::vector hw; + std::vector ms; + int img_h = 0; + int img_w = 0; + float 
scale = 0.0; + float mean[3] = {-1.0, -1.0, -1.0}; + + int res; + while((res = getopt(argc, argv, "n:t:l:i:g:s:w:r:h")) != -1) + { + switch(res) + { + case 'n': + model_name = optarg; + break; + case 't': + tm_file = optarg; + break; + case 'l': + label_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'g': + hw = ParseString(optarg); + if(hw.size() != 2) + { + std::cerr << "Error -g parameter.\n"; + return -1; + } + img_h = hw[0]; + img_w = hw[1]; + break; + case 's': + scale = strtof(optarg, NULL); + break; + case 'w': + ms = ParseString(optarg); + if(ms.size() != 3) + { + std::cerr << "Error -w parameter.\n"; + return -1; + } + mean[0] = ms[0]; + mean[1] = ms[1]; + mean[2] = ms[2]; + break; + case 'r': + repeat_count = std::strtoul(optarg, NULL, 10); + break; + case 'h': + std::cout << "[Usage]: " << argv[0] << " [-h]\n" + << " [-n model_name] [-t tm_file] [-l label_file] [-i image_file]\n" + << " [-g img_h,img_w] [-s scale] [-w mean[0],mean[1],mean[2]] [-r repeat_count]\n"; + return 0; + default: + break; + } + } + + const Model_Config* mod_config; + // if model files not specified + if(tm_file.empty()) + { + // if model name not specified + if(model_name.empty()) + { + // use default model + model_name = DEFAULT_MODEL_NAME; + std::cout << "Model name and tm file not specified, run " << model_name << " by default.\n"; + } + // get model config in predefined model list + mod_config = get_model_config(model_name.c_str()); + if(mod_config == nullptr) + return -1; + + // get tm file + tm_file = get_file(mod_config->tm_file); + if(tm_file.empty()) + return -1; + + // if label file not specified + if(label_file.empty()) + { + // get label file + label_file = get_file(mod_config->label_file); + if(label_file.empty()) + return -1; + } + + if(!hw.size()) + { + img_h = mod_config->img_h; + img_w = mod_config->img_w; + } + if(scale == 0.0) + scale = mod_config->scale; + if(!ms.size()) + { + mean[0] = mod_config->mean[0]; + mean[1] = 
mod_config->mean[1]; + mean[2] = mod_config->mean[2]; + } + } + + // if label file not specified, use default label file + if(label_file.empty()) + { + label_file = root_path + DEFAULT_LABEL_FILE; + std::cout << "Label file not specified, use " << label_file << " by default.\n"; + } + + // if image file not specified, use default image file + if(image_file.empty()) + { + image_file = root_path + DEFAULT_IMAGE_FILE; + std::cout << "Image file not specified, use " << image_file << " by default.\n"; + } + + if(img_h == 0) + img_h = DEFAULT_IMG_H; + if(img_w == 0) + img_w = DEFAULT_IMG_W; + if(scale == 0.0) + scale = DEFAULT_SCALE; + if(mean[0] == -1.0) + mean[0] = DEFAULT_MEAN1; + if(mean[1] == -1.0) + mean[1] = DEFAULT_MEAN2; + if(mean[2] == -1.0) + mean[2] = DEFAULT_MEAN3; + if(model_name.empty()) + model_name = "unknown"; + + // check input files + if(!check_file_exist(tm_file) || !check_file_exist(label_file) || !check_file_exist(image_file)) + return -1; + + // start to run + if(!run_tengine_library(model_name.c_str(), tm_file.c_str(), label_file.c_str(), image_file.c_str(), img_h, img_w, + mean, scale, repeat_count)) + return -1; + + std::cout << "ALL TEST DONE\n"; + + return 0; +} diff --git a/examples/tengine_model/convert/CMakeLists.txt b/examples/tengine_model/convert/CMakeLists.txt index 5d24442d3..fd461c91c 100644 --- a/examples/tengine_model/convert/CMakeLists.txt +++ b/examples/tengine_model/convert/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(tm_convert) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) set( CODE_SRCS convert_caffe_to_tm.cpp ../../common/common.cpp) set( BIN_EXE convert_caffe_to_tm ) @@ -33,6 +33,10 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() + #exe add_executable(${BIN_EXE} ${CODE_SRCS}) @@ -40,6 +44,6 @@ add_executable(${BIN_EXE} ${CODE_SRCS}) if( ANDROID) target_link_libraries(${BIN_EXE} 
${TENGINE_LIBS} ${PROTOBUF_LIB} ${BLAS_LIB}) else() - target_link_libraries(${BIN_EXE} ${TENGINE_LIBS}) + target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/tengine_model/convert/convert_caffe_to_tm.cpp b/examples/tengine_model/convert/convert_caffe_to_tm.cpp index cc6c04205..7207dae3d 100644 --- a/examples/tengine_model/convert/convert_caffe_to_tm.cpp +++ b/examples/tengine_model/convert/convert_caffe_to_tm.cpp @@ -89,6 +89,24 @@ int main(int argc, char* argv[]) return -1; } + const char* env = std::getenv("TM_NO_OPTIMIZE"); + if(env == nullptr) + { + // optimize graph + int optimize_only = 1; + if(set_graph_attr(graph, "optimize_only", &optimize_only, sizeof(int)) < 0) + { + std::cerr<<"set optimize only failed\n"; + return -1; + } + + if(prerun_graph(graph) < 0) + { + std::cerr<<"prerun failed\n"; + return -1; + } + } + // save the tengine model file if(save_graph(graph, "tengine", output_tmfile.c_str()) < 0) { diff --git a/examples/tensorflow_wrapper/label_image/CMakeLists.txt b/examples/tensorflow_wrapper/label_image/CMakeLists.txt index 354849940..fb4443b64 100644 --- a/examples/tensorflow_wrapper/label_image/CMakeLists.txt +++ b/examples/tensorflow_wrapper/label_image/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(label_image) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS wrapper tengine boost_system boost_thread ) +set( TENGINE_LIBS wrapper tengine hclcpu boost_system boost_thread ) set( CODE_SRC label_image.cpp ../../common/common.cpp ) #opencv @@ -20,15 +20,19 @@ set(CMAKE_CXX_FLAGS "-std=c++11 -O3 -Wall") #lib link_directories(${INSTALL_DIR}/lib) +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() + #exe add_executable(label_image_inceptionv3 ${CODE_SRC}) -target_link_libraries(label_image_inceptionv3 ${TENGINE_LIBS} ${OpenCV_LIBS}) +target_link_libraries(label_image_inceptionv3 ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) add_executable(label_image_mobilenet 
${CODE_SRC}) target_compile_definitions(label_image_mobilenet PUBLIC -DMOBILE_NET) -target_link_libraries(label_image_mobilenet ${TENGINE_LIBS} ${OpenCV_LIBS}) +target_link_libraries(label_image_mobilenet ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) add_executable(label_image_resnet50 ${CODE_SRC}) target_compile_definitions(label_image_resnet50 PUBLIC -DRESNET50) -target_link_libraries(label_image_resnet50 ${TENGINE_LIBS} ${OpenCV_LIBS}) +target_link_libraries(label_image_resnet50 ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) diff --git a/examples/yolov2/CMakeLists.txt b/examples/yolov2/CMakeLists.txt index f74ec5d7a..66fd6e73b 100644 --- a/examples/yolov2/CMakeLists.txt +++ b/examples/yolov2/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 2.8) project(YOLOV2) set( INSTALL_DIR ${TENGINE_DIR}/install/ ) -set( TENGINE_LIBS tengine) +set( TENGINE_LIBS tengine hclcpu) if( ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")) set( PROTOBUF_LIB ${PROTOBUF_DIR}/arm32_lib/libprotobuf.so) @@ -20,6 +20,11 @@ if( BLAS_DIR) endif() endif() +if ( NOT (ARM OR ANDROID)) +set (BLAS_LIB openblas) +endif() + + set( CODE_SRCS yolov2.cpp ../common/common.cpp) set( BIN_EXE YOLOV2) @@ -46,7 +51,7 @@ add_executable(${BIN_EXE} ${CODE_SRCS}) if( ANDROID) target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${PROTOBUF_LIB} ${OpenCV_LIBS} ${BLAS_LIB}) else() - target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${OpenCV_LIBS}) + target_link_libraries(${BIN_EXE} ${TENGINE_LIBS} ${OpenCV_LIBS} ${BLAS_LIB}) endif() diff --git a/examples/yolov2/yolov2.cpp b/examples/yolov2/yolov2.cpp index ba29f571e..e5f1e94f7 100644 --- a/examples/yolov2/yolov2.cpp +++ b/examples/yolov2/yolov2.cpp @@ -479,7 +479,7 @@ int main(int argc, char** argv) std::vector param_biases; - if(get_node_attr_generic(node, "biases", &typeid(std::vector), ¶m_biases, sizeof(param_biases)) < 0) + if(get_node_attr_generic(node, "biases", typeid(std::vector).name(), ¶m_biases, sizeof(param_biases)) < 0) { std::cout << 
"cannot get bias settings\n"; return 1; diff --git a/executor/Makefile b/executor/Makefile index 8a64f79a3..0bd90ee7f 100644 --- a/executor/Makefile +++ b/executor/Makefile @@ -1,4 +1,3 @@ -obj-y+=operator/ obj-y+=lib/ obj-y+=engine/ obj-y+=plugin/ diff --git a/executor/include/kernel_registry.hpp b/executor/include/kernel_registry.hpp new file mode 100644 index 000000000..0c1e277b3 --- /dev/null +++ b/executor/include/kernel_registry.hpp @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#ifndef __KERNEL_REGISTRY_HPP__ +#define __KERNEL_REGISTRY_HPP__ + +#include + +namespace TEngine { + +template +class KernelRegistry { +public: + bool Register(const T& t, int layout, int data_type) + { + int idx=get_idx(layout,data_type); + + if(map.count(idx)) + return false; + + map[idx]=t; + return true; + } + + bool GetKernel(T& t, int layout, int data_type) + { + int idx=get_idx(layout,data_type); + + if(map.count(idx)==0) + return false; + + t=map[idx]; + + return true; + } + +private: + int get_idx(int layout, int data_type) { return (layout<<8)|(data_type);} + + + std::unordered_map map; + +}; + + + +} //namespace TEngine + +#endif diff --git a/executor/include/node_ops.hpp b/executor/include/node_ops.hpp index 0d2e34ab3..5404d20ce 100644 --- a/executor/include/node_ops.hpp +++ b/executor/include/node_ops.hpp @@ -37,6 +37,10 @@ #include "cpu_info.hpp" #include "exec_attr.hpp" +#ifdef __ANDROID__ +#define dynamic_cast static_cast +#endif + namespace TEngine { #define ATTR_NODE_OPS "node_ops" @@ -191,15 +195,26 @@ struct PrioSelector : public NodeOpsSelector auto ops = match_func(cpu_info, node); if(ops) + { + ops->need_free=true; return ops; + } } return nullptr; } - void Register(int priority, select_node_ops_t func) + bool Register(int priority, select_node_ops_t func) { + for(auto ir = prio_list.begin(); ir != prio_list.end(); ir++) + { + auto prio = ir->first; + + if(prio == priority) + return false; + } prio_list[priority] = func; + return true; } std::map prio_list; diff --git a/executor/lib/Makefile b/executor/lib/Makefile index 29a1390c9..dec8747b8 100644 --- a/executor/lib/Makefile +++ b/executor/lib/Makefile @@ -9,8 +9,8 @@ obj-y+=device_driver.o obj-y+=node_dev_executor.o obj-y+=node_dev_driver.o obj-y+=node_ops.o -obj-y+=tengine_test_api.o obj-y+=cpu_info.o obj-y+=custom_kernel.o +obj-y+=custom_kernel_ops.o COMMON_CFLAGS+=-I../../driver/cpu diff --git 
a/executor/lib/custom_kernel.cpp b/executor/lib/custom_kernel.cpp index 56115c571..b0e71ecd0 100644 --- a/executor/lib/custom_kernel.cpp +++ b/executor/lib/custom_kernel.cpp @@ -26,7 +26,7 @@ static void PrepareOneTensor(Node* node, Tensor* tensor, struct custom_kernel_te t->data_type = tensor->GetDataType(); t->element_num = shape.GetSize(); t->element_size = DataType::GetTypeSize(t->data_type); - t->layout_type = exec_attr->layout; + t->layout_type = exec_attr->graph_layout; t->data = nullptr; } diff --git a/executor/operator/common/custom_kernel_ops.cpp b/executor/lib/custom_kernel_ops.cpp similarity index 99% rename from executor/operator/common/custom_kernel_ops.cpp rename to executor/lib/custom_kernel_ops.cpp index a063ad56b..88598554a 100644 --- a/executor/operator/common/custom_kernel_ops.cpp +++ b/executor/lib/custom_kernel_ops.cpp @@ -170,8 +170,6 @@ NodeOps* CustomKernelNodeOps::NewOps(Node* node, struct custom_kernel_ops* k_ops { NodeOps* ops = new CustomKernelNodeOps(node, k_ops); - ops->need_free = 1; - return ops; } diff --git a/executor/lib/device_driver.cpp b/executor/lib/device_driver.cpp index 3cee38838..dc8555fbc 100644 --- a/executor/lib/device_driver.cpp +++ b/executor/lib/device_driver.cpp @@ -56,7 +56,7 @@ bool DriverManager::UnregisterDriver(Driver* driver) Driver* DriverManager::GetDriver(const std::string& name) { - Driver* driver; + Driver* driver=nullptr; if(!SafeGet(name, driver)) return nullptr; diff --git a/executor/lib/generic_dev_executor.cpp b/executor/lib/generic_dev_executor.cpp index ebbf16f2b..e96a65f08 100644 --- a/executor/lib/generic_dev_executor.cpp +++ b/executor/lib/generic_dev_executor.cpp @@ -233,6 +233,16 @@ bool GenericDevExecutor::PrerunTask(SubgraphTask* task) if(task->graph_handle == nullptr || !OptimizeGraph(task)) return false; + GraphTask * graph_task=task->graph_task; + GraphExecutor * executor=graph_task->GetGraphExecutor(); + + int optimize_only=0; + + 
executor->GetGraphAttr("optimize_only",&optimize_only,sizeof(int)); + + if(optimize_only) + return true; + unsigned int mem_size; if(DevGetMemorySize(task->graph_handle, mem_size)) diff --git a/executor/lib/graph_optimizer.cpp b/executor/lib/graph_optimizer.cpp index 59621fe08..7920a260b 100644 --- a/executor/lib/graph_optimizer.cpp +++ b/executor/lib/graph_optimizer.cpp @@ -48,29 +48,25 @@ static void AddConstNodeToSubGraph(Subgraph* graph, Tensor* tensor, Node* fused_ static bool Weight_Bn(Subgraph* graph, Node* ConvNode, float* mean, float* var, float* gamma, float* beta, float eps, float rescale_factor, Tensor* bias_tensor) { - Tensor* input_tensor = ConvNode->GetInputTensor(0); + Tensor* kernel_tensor = ConvNode->GetInputTensor(1); Convolution* conv_op = dynamic_cast(ConvNode->GetOp()); ConvParam* param = conv_op->GetParam(); - const TShape& input_shape = input_tensor->GetShape(); + const TShape& kernel_shape = kernel_tensor->GetShape(); int group = param->group; - int input_chan = input_shape.Shape(1) / group; - - Tensor* output_tensor = ConvNode->GetOutputTensor(0); - TShape& output_shape = output_tensor->GetShape(); + int input_chan = kernel_shape.Shape(1) ; - int output_chan = output_shape.GetC() / group; + int output_chan = kernel_shape.Shape(0) / group; int kernel_x = param->kernel_w; int kernel_y = param->kernel_h; int kernel_size = input_chan * kernel_x * kernel_y; - Tensor* kernel_tensor = ConvNode->GetInputTensor(1); float* kernel_org = ( float* )get_tensor_mem(kernel_tensor); - int channel_num = output_shape.GetC(); + int channel_num = kernel_shape.Shape(0); - float* kernel_new = ( float* )(malloc(kernel_size * channel_num * sizeof(float))); + float* kernel_new = ( float* )(malloc(kernel_size * channel_num * sizeof(float) + 128)); memcpy(kernel_new, kernel_org, sizeof(float) * kernel_size * channel_num); @@ -102,7 +98,7 @@ static bool Weight_Bn(Subgraph* graph, Node* ConvNode, float* mean, float* var, { Tensor* new_bias_tensor = new 
Tensor(bias_name); - std::vector dims{1, channel_num, 1, 1}; + std::vector dims{channel_num}; TShape bias_shape; bias_shape.SetDim(dims); @@ -110,7 +106,7 @@ static bool Weight_Bn(Subgraph* graph, Node* ConvNode, float* mean, float* var, new_bias_tensor->Reshape(bias_shape); new_bias_tensor->SetType(kConstTensor); - void* bias_new = ( void* )malloc(channel_num * sizeof(float)); + void* bias_new = ( void* )malloc(channel_num * sizeof(float) + 128); new_bias_tensor->SetMemAddr(bias_new); diff --git a/executor/lib/graph_task.cpp b/executor/lib/graph_task.cpp index 1b2bf836b..931a6e038 100644 --- a/executor/lib/graph_task.cpp +++ b/executor/lib/graph_task.cpp @@ -207,6 +207,11 @@ Graph* GraphTask::GetOptimizedGraph(void) optimized_graph_ = MergeSubgraph(graph_, sub_list); + optimized_graph_->SetLayout(graph_->GetLayout()); + optimized_graph_->SetModelLayout(graph_->GetModelLayout()); + optimized_graph_->SetModelFormat(graph_->GetModelFormat()); + optimized_graph_->SetModelSubFormat(graph_->GetModelSubFormat()); + return optimized_graph_; } diff --git a/executor/lib/node_ops.cpp b/executor/lib/node_ops.cpp index 19a110a5a..cdea37202 100644 --- a/executor/lib/node_ops.cpp +++ b/executor/lib/node_ops.cpp @@ -347,7 +347,8 @@ bool NodeOpsRegistryManager::RegisterOPImplementor(const std::string& registry_n registry->RegisterSelector(prio_selector); } - prio_selector->Register(priority, select_func); + if(!prio_selector->Register(priority, select_func)) + return false; return true; } diff --git a/executor/lib/tengine_test_api.cpp b/executor/lib/tengine_test_api.cpp deleted file mode 100644 index 4b34e96e7..000000000 --- a/executor/lib/tengine_test_api.cpp +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ -#include "data_type.hpp" -#include "exec_context.hpp" -#include "graph.hpp" -#include "tensor_mem.hpp" -#include "operator/convolution.hpp" - -#include "tengine_test_api.h" -#include "node_ops.hpp" -#include "cpu_driver.hpp" -#include "graph_executor.hpp" - -using namespace TEngine; - -test_node_t create_convolution_test_node(int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h0, int pad_h1, - int pad_w0, int pad_w1, int dilation_h, int dilation_w, int input_channel, - int output_channel, int group) -{ - /* create op */ - - Operator* op = OpManager::CreateOp("Convolution"); - Convolution* conv_op = dynamic_cast(op); - - ConvParam* param = conv_op->GetParam(); - - param->kernel_h = kernel_h; - param->kernel_w = kernel_w; - param->stride_h = stride_h; - param->stride_w = stride_w; - param->output_channel = output_channel; - param->group = group; - param->dilation_h = dilation_h; - param->dilation_w = dilation_w; - - param->pad_h = -1; - param->pad_w = -1; - - param->pads.resize(4); - param->pads[0] = pad_h0; - param->pads[1] = pad_w0; - param->pads[2] = pad_h1; - param->pads[3] = pad_w1; - - /* create node */ - - Node* node = new Node("test_convolution"); - - node->SetOp(conv_op); - - return node; -} - -static int test_conv_node_set_input(Node* node, float* input_data[], int* input_shape[], int 
input_number) -{ - // input - - Tensor* tensor = new Tensor("input"); - - tensor->SetDataType(DataType::GetTypeID("float32")); - tensor->SetType(kConstTensor); - tensor->SetMemAddr(input_data[0]); - - int* input_dim = input_shape[0]; - - std::vector input_dims = {input_dim[0], input_dim[1], input_dim[2], input_dim[3]}; - - TShape& intput_shape = tensor->GetShape(); - - intput_shape.SetDataLayout("NCHW"); - intput_shape.SetDim(input_dims); - - node->AddInputTensor(tensor); - - // weight - - tensor = new Tensor("weight"); - - tensor->SetDataType(DataType::GetTypeID("float32")); - tensor->SetType(kConstTensor); - tensor->SetMemAddr(input_data[1]); - - input_dim = input_shape[1]; - - std::vector weight_dims = {input_dim[0], input_dim[1], input_dim[2], input_dim[3]}; - - TShape& weight_shape = tensor->GetShape(); - - weight_shape.SetDataLayout("NCHW"); - weight_shape.SetDim(weight_dims); - - node->AddInputTensor(tensor); - - if(input_number == 2) - return 0; - - // bias - - tensor = new Tensor("bias"); - - tensor->SetDataType(DataType::GetTypeID("float32")); - tensor->SetType(kConstTensor); - tensor->SetMemAddr(input_data[2]); - - input_dim = input_shape[2]; - - std::vector bias_dims = {input_dim[0]}; - - TShape& bias_shape = tensor->GetShape(); - - bias_shape.SetDataLayout("W"); - bias_shape.SetDim(bias_dims); - - node->AddInputTensor(tensor); - - return 0; -} - -int test_node_set_input(test_node_t node, float* input_data[], int* input_shape[], int input_number) -{ - Node* test_node = ( Node* )node; - - Operator* op = test_node->GetOp(); - - if(op->GetName() == "Convolution") - return test_conv_node_set_input(test_node, input_data, input_shape, input_number); - - return -1; -} - -static int test_conv_node_set_output(Node* node, float* output_data, int* output_shape) -{ - Tensor* tensor = new Tensor("output"); - - tensor->SetDataType(DataType::GetTypeID("float32")); - tensor->SetType(kConstTensor); - tensor->SetMemAddr(output_data); - - int* output_dim = output_shape; - 
- std::vector output_dims = {output_dim[0], output_dim[1], output_dim[2], output_dim[3]}; - - TShape& shape = tensor->GetShape(); - - shape.SetDataLayout("NCHW"); - shape.SetDim(output_dims); - - node->AddOutputTensor(tensor); - - return 0; -} - -int test_node_set_output(test_node_t node, float* output_data[], int* output_shape[], int output_number) -{ - Node* test_node = ( Node* )node; - - Operator* op = test_node->GetOp(); - - if(op->GetName() == "Convolution") - return test_conv_node_set_output(test_node, output_data[0], output_shape[0]); - - return -1; -} - -static Graph* create_test_graph(Node* node) -{ - Graph* graph = new Graph(node->GetName()); - - node->SetNodeIndex(0); - graph->seq_nodes.push_back(node); - - graph->AddInputNode(node); - graph->AddOutputNode(node); - - /* for all tensors */ - - for(unsigned int i = 0; i < node->GetInputNum(); i++) - { - Tensor* tensor = node->GetInputTensor(i); - graph->AddTensorMap(tensor->GetName(), tensor); - } - - for(unsigned int i = 0; i < node->GetOutputNum(); i++) - { - Tensor* tensor = node->GetOutputTensor(i); - graph->AddTensorMap(tensor->GetName(), tensor); - } - - return graph; -} - -int test_node_prerun(test_node_t node) -{ - Node* test_node = ( Node* )node; - - // create graph for this node - - Graph* graph = create_test_graph(test_node); - - GraphExecutor* executor = new GraphExecutor(); - ExecContext* exec_context = ExecContext::GetDefaultContext(); - - if(!executor->AttachGraph(exec_context, graph) || !executor->Prerun()) - { - std::cout << "Prerun failed\n"; - return -1; - } - - test_node->SetAttr("TEST_EXECUTOR", executor); - - return 0; - - /* - NodeOps * node_ops=NodeOpsRegistryManager::FindNodeOps(cpu_dev->GetCPUInfo(),test_node); - - if(node_ops==nullptr) - return -1; - - auto dispatch=std::bind(&CPUDevice::PushAiderTask,cpu_dev,std::placeholders::_1, - std::placeholders::_2); - - auto wait=std::bind(&CPUDevice::WaitDone,cpu_dev); - - node_ops->SetHelper(std::malloc,std::free,dispatch,wait); - - - 
if(!node_ops->Prerun(test_node)) - { - std::cout<<"Prerun failed\n"; - return -1; - } - - test_node->SetAttr(ATTR_NODE_OPS,node_ops); - */ - - return 0; -} - -int test_node_run(test_node_t node) -{ - Node* test_node = ( Node* )node; - - GraphExecutor* executor = any_cast(test_node->GetAttr("TEST_EXECUTOR")); - - if(!executor->SyncRun()) - { - std::cout << "Run failed\n"; - return -1; - } - - return 0; - - /* - NodeOps * node_ops=any_cast(test_node->GetAttr(ATTR_NODE_OPS)); - - if(!node_ops->Run(test_node)) - { - std::cout<<"Run failed\n"; - return -1; - } - */ - - return 0; -} - -int test_node_postrun(test_node_t node) -{ - Node* test_node = ( Node* )node; - - GraphExecutor* executor = any_cast(test_node->GetAttr("TEST_EXECUTOR")); - - if(!executor->Postrun()) - { - std::cout << "Postrun failed\n"; - return -1; - } - - return 0; - - /* - NodeOps * node_ops=any_cast(test_node->GetAttr(ATTR_NODE_OPS)); - - if(!node_ops->Postrun(test_node)) - { - std::cout<<"Postrun failed\n"; - return -1; - } - */ - - return 0; -} - -void destroy_test_node(test_node_t node) -{ - Node* test_node = ( Node* )node; - - /* releaset graph executor & graph */ - - GraphExecutor* executor = any_cast(test_node->GetAttr("TEST_EXECUTOR")); - - Graph* graph = executor->GetGraph(); - - delete executor; - delete graph; - - /* free tensor */ - - for(unsigned int i = 0; i < test_node->GetInputNum(); i++) - { - Tensor* tensor = test_node->GetInputTensor(i); - - delete tensor; - } - - for(unsigned int i = 0; i < test_node->GetOutputNum(); i++) - { - Tensor* tensor = test_node->GetOutputTensor(i); - - delete tensor; - } - - /* free node */ - - delete test_node; -} diff --git a/executor/lib/tensor_mem.cpp b/executor/lib/tensor_mem.cpp index 3c4acbc7b..61c00f292 100644 --- a/executor/lib/tensor_mem.cpp +++ b/executor/lib/tensor_mem.cpp @@ -66,6 +66,8 @@ bool get_tensor_memptr(const Tensor* tensor, TensorMemPtr& ptr) bool set_tensor_mem(Tensor* tensor, void* addr, int size, mem_release_t releaser) { + 
if(addr == nullptr || size == 0) + return false; if(tensor->GetType() == kConstTensor) { LOG_DEBUG() << __FUNCTION__ << ": set const tensor " << tensor->GetName() << " mem: " << addr << "\n"; diff --git a/executor/operator/Makefile b/executor/operator/Makefile index 4d1070788..ade53fe0c 100644 --- a/executor/operator/Makefile +++ b/executor/operator/Makefile @@ -1,13 +1,10 @@ obj-$(CONFIG_ARCH_ARM64)+=arm64/ obj-y+=ref/ obj-y+=common/ +obj-y+=init.o COMMON_CFLAGS+=-I$(shell pwd)/include -ifneq ($(CONFIG_OPT_CFLAGS),) - COMMON_CFLAGS+=-O3 -funroll-loops -endif - #below are examples to build with pre-compiled object #prebuilt-obj-$(CONFIG_ARCH_ARM64)+=arm.o #prebuilt-obj-y+=ref/built-in.o diff --git a/executor/operator/arm64/Makefile b/executor/operator/arm64/Makefile index 19f1fa8de..0b7ba803e 100644 --- a/executor/operator/arm64/Makefile +++ b/executor/operator/arm64/Makefile @@ -8,7 +8,9 @@ obj-y+=pooling.o obj-y+=scale_neon.o obj-y+=init.o - -CXXFLAGS+=-I./include +CXXFLAGS+=-I./include pooling_CXXFLAGS+=-Wno-strict-aliasing + +COMMON_CFLAGS+=-fvisibility=hidden + diff --git a/executor/operator/arm64/batch_norm.cpp b/executor/operator/arm64/batch_norm.cpp index 3e55b5992..0fb59212c 100644 --- a/executor/operator/arm64/batch_norm.cpp +++ b/executor/operator/arm64/batch_norm.cpp @@ -35,7 +35,7 @@ #include namespace TEngine { -namespace BatchNormImpl { +namespace BatchNormImpl64 { struct BNOps : public NodeOps { @@ -206,15 +206,24 @@ struct BNOps : public NodeOps } }; -} // namespace BatchNormImpl +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + if((input->GetShape()).GetDim().size() != 4) + return nullptr; + + BNOps* ops = new BNOps(); + + return ops; +} + +} // namespace BatchNormImpl64 -using namespace BatchNormImpl; +using namespace BatchNormImpl64; void RegisterBatchNormNodeExec(void) { - BNOps* ops = new BNOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("arm64", BatchNormName, ops); + 
NodeOpsRegistryManager::RegisterOPImplementor("arm64", BatchNormName, BatchNormImpl64::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/arm64/conv/Makefile b/executor/operator/arm64/conv/Makefile index 103780883..8b4834741 100644 --- a/executor/operator/arm64/conv/Makefile +++ b/executor/operator/arm64/conv/Makefile @@ -9,4 +9,7 @@ obj-y+=dw_k3s2p1.o obj-y+=dw_k3s1p1_relu_fused.o obj-y+=dw_k3s2p1_relu_fused.o +CXXFLAGS+=-I../include + +conv_2d_acl_CXXFLAGS+=-I$(ACL_ROOT) -I$(ACL_ROOT)/include -I$(ACL_ROOT)/utils diff --git a/executor/operator/arm64/conv/conv_2d_dw.cpp b/executor/operator/arm64/conv/conv_2d_dw.cpp index f13cfbbe4..0e739eeb4 100644 --- a/executor/operator/arm64/conv/conv_2d_dw.cpp +++ b/executor/operator/arm64/conv/conv_2d_dw.cpp @@ -1,286 +1,286 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haitao@openailab.com - */ -#include -#include -#include - -#include "logger.hpp" -#include "tensor_mem.hpp" - -#include "graph.hpp" -#include "node_ops.hpp" -#include "operator/convolution.hpp" -#include -namespace TEngine { - -namespace conv_2d_dw { - -const char* conv_name = "CONV_DW"; -const int default_prio = 10; - -extern "C" void dw_k3s1p1(float* data, int h, int w, float* kernel, float* output, float* bias); -extern "C" void dw_k3s2p1(float* data, int h, int w, float* kernel, float* output, float* bias); -extern "C" void dw_k3s1p1_relu_fused(float* data, int h, int w, float* kernel, float* output, float* bias); -extern "C" void dw_k3s2p1_relu_fused(float* data, int h, int w, float* kernel, float* output, float* bias); - -struct dw_param -{ - float* input_buf; - int input_h; - int input_w; - float* output_buf; - int output_h; - int output_w; - float* weight_buf; - int channel_num; - int stride; - float* bias; -}; - -struct Conv2dDepth : public NodeOps -{ - bool Run(Node* node); - - int activation; - - void DirectConv(float* input_buf, int input_h, int input_w, float* output_buf, int output_h, int output_w, - float* weight_buf, int channel_num, int stride, float* bias); - - bool Aider(int cpu, int seq, void* data); -}; - -bool Conv2dDepth::Aider(int cpu, int seq, void* data) -{ - dw_param* param = ( dw_param* )data; - - DirectConv(param->input_buf, param->input_h, param->input_w, param->output_buf, param->output_h, param->output_w, - param->weight_buf, param->channel_num, param->stride, param->bias); - - return true; -} - -void Conv2dDepth::DirectConv(float* input_buf, int input_h, int input_w, float* output_buf, int output_h, int output_w, - float* weight_buf, int channel_num, int stride, float* bias) -{ - int channel_size = input_h * input_w; - float* bias_tmp = bias; - - for(int i = 0; i < channel_num; i++) - { - if(NULL != bias) - bias_tmp = bias + i; - if(stride == 1) - { - if(activation >= 0) - 
dw_k3s1p1_relu_fused(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); - else - dw_k3s1p1(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); - - if(activation > 0) - { - for(int i = 0; i < channel_size; i++) - output_buf[i] = std::min(output_buf[i], ( float )activation); - } - - input_buf += channel_size; - output_buf += channel_size; - weight_buf += 9; - } - else if(stride == 2) - { - if(activation >= 0) - dw_k3s2p1_relu_fused(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); - else - dw_k3s2p1(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); - - if(activation > 0) - { - for(int i = 0; i < output_h * output_w; i++) - output_buf[i] = std::min(output_buf[i], ( float )activation); - } - - input_buf += channel_size; - output_buf += output_h * output_w; - weight_buf += 9; - } - } -} - -bool Conv2dDepth::Run(Node* node) -{ - Tensor* input_tensor = node->GetInputTensor(0); - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - const TShape& input_shape = input_tensor->GetShape(); - - int input_c = input_shape.GetC(); - int input_h = input_shape.GetH(); - int input_w = input_shape.GetW(); - - /* output */ - Tensor* output_tensor = node->GetOutputTensor(0); - TShape& output_shape = output_tensor->GetShape(); - - int output_h = output_shape.GetH(); - int output_w = output_shape.GetW(); - int output_n = output_shape.GetN(); - - Tensor* weight_tensor = node->GetInputTensor(1); - float* weight_buf = ( float* )get_tensor_mem(weight_tensor); - float* input_buf = ( float* )get_tensor_mem(input_tensor); - float* output_buf = ( float* )get_tensor_mem(output_tensor); - - int stride_h = param->stride_h; - int cpu_number = cpu_info->GetCPUNumber(); - - float* bias = NULL; - // get bias - if(node->GetInputNum() > 2) - { - Tensor* bias_tensor = node->GetInputTensor(2); - bias = ( float* )get_tensor_mem(bias_tensor); - } - - for(int i = 0; i < output_n; i++) - { - if(cpu_number == 1) - 
DirectConv(input_buf, input_h, input_w, output_buf, output_h, output_w, weight_buf, input_c, stride_h, - bias); - else - { - // partition into 4 tasks - std::vector task_list; - std::vector param_list; - - auto f = std::bind(&Conv2dDepth::Aider, this, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - - task_list.resize(cpu_number); - param_list.resize(cpu_number); - - int step = input_c / cpu_number; - int channel_size = input_h * input_w; - - for(int i = 0; i < cpu_number; i++) - { - dw_param* param = ¶m_list[i]; - sub_op_task* task = &task_list[i]; - - task->exec_func = f; - task->seq = i; - task->data = param; - - param->input_buf = input_buf; - param->input_h = input_h; - param->input_w = input_w; - param->output_buf = output_buf; - param->output_h = output_h; - param->output_w = output_w; - param->weight_buf = weight_buf; - param->channel_num = step; - param->stride = stride_h; - if(NULL != bias) - param->bias = bias + i * step; - else - param->bias = NULL; - - input_buf += channel_size * step; - if(stride_h == 1) - output_buf += channel_size * step; - else - output_buf += output_h * output_w * step; - weight_buf += 9 * step; - } - - // the last left ones - param_list[cpu_number - 1].channel_num += input_c - cpu_number * step; - - task_dispatch(task_list, -1); - - wait_done(); - } - } - - return true; -} - -static bool isDepthwiseSupported(const ConvParam* param, const TShape& input_shape) -{ - int input_c = input_shape.GetC(); - int group = param->group; - int kernel_h = param->kernel_h; - int kernel_w = param->kernel_w; - int stride_h = param->stride_h; - int stride_w = param->stride_w; - int dilation_h = param->dilation_h; - int dilation_w = param->dilation_w; - int pad_h0 = param->pads[0]; - int pad_w0 = param->pads[1]; - int pad_h1 = param->pads[2]; - int pad_w1 = param->pads[3]; - - if(group == 1 || input_c != group || kernel_h != 3 || kernel_w != 3 || pad_h0 != 1 || pad_w0 != 1 || - pad_h0 != pad_h1 || pad_w0 != pad_w1 || 
dilation_h != 1 || dilation_w != 1 || stride_w != stride_h) - { - return false; - } - return true; -} - -NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) -{ - const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); - - if(exec_attr->layout == TENGINE_LAYOUT_NHWC) - return nullptr; - - Operator* op = node->GetOp(); - - Convolution* conv_op = dynamic_cast(op); - - ConvParam* param = conv_op->GetParam(); - - const TShape& input_shape = node->GetInputTensor(0)->GetShape(); - - if(!isDepthwiseSupported(param, input_shape)) - return nullptr; - - Conv2dDepth* ops = new Conv2dDepth(); - - ops->activation = param->activation; - - ops->need_free = true; - - return ops; -} - -} // namespace conv_2d_dw - -void RegisterConv2dDepth(void) -{ - NodeOpsRegistryManager::RegisterOPImplementor("arm64", "Convolution", conv_2d_dw::SelectFunc, - conv_2d_dw::default_prio); -} - -} // namespace TEngine +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ +#include +#include +#include + +#include "logger.hpp" +#include "tensor_mem.hpp" + +#include "graph.hpp" +#include "node_ops.hpp" +#include "operator/convolution.hpp" +#include +namespace TEngine { + +namespace conv_2d_dw { + +const char* conv_name = "CONV_DW"; +const int default_prio = 10; + +extern "C" void dw_k3s1p1(float* data, int h, int w, float* kernel, float* output, float* bias); +extern "C" void dw_k3s2p1(float* data, int h, int w, float* kernel, float* output, float* bias); +extern "C" void dw_k3s1p1_relu_fused(float* data, int h, int w, float* kernel, float* output, float* bias); +extern "C" void dw_k3s2p1_relu_fused(float* data, int h, int w, float* kernel, float* output, float* bias); + +struct dw_param +{ + float* input_buf; + int input_h; + int input_w; + float* output_buf; + int output_h; + int output_w; + float* weight_buf; + int channel_num; + int stride; + float* bias; +}; + +struct Conv2dDepth : public NodeOps +{ + bool Run(Node* node); + + int activation; + + void DirectConv(float* input_buf, int input_h, int input_w, float* output_buf, int output_h, int output_w, + float* weight_buf, int channel_num, int stride, float* bias); + + bool Aider(int cpu, int seq, void* data); +}; + +bool Conv2dDepth::Aider(int cpu, int seq, void* data) +{ + dw_param* param = ( dw_param* )data; + + DirectConv(param->input_buf, param->input_h, param->input_w, param->output_buf, param->output_h, param->output_w, + param->weight_buf, param->channel_num, param->stride, param->bias); + + return true; +} + +void Conv2dDepth::DirectConv(float* input_buf, int input_h, int input_w, float* output_buf, int output_h, int output_w, + float* weight_buf, int channel_num, int stride, float* bias) +{ + int channel_size = input_h * input_w; + float* bias_tmp = bias; + + for(int i = 0; i < channel_num; i++) + { + if(NULL != bias) + bias_tmp = bias + i; + if(stride == 1) + { + if(activation >= 0) + 
dw_k3s1p1_relu_fused(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); + else + dw_k3s1p1(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); + + if(activation > 0) + { + for(int i = 0; i < channel_size; i++) + output_buf[i] = std::min(output_buf[i], ( float )activation); + } + + input_buf += channel_size; + output_buf += channel_size; + weight_buf += 9; + } + else if(stride == 2) + { + if(activation >= 0) + dw_k3s2p1_relu_fused(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); + else + dw_k3s2p1(input_buf, input_h, input_w, weight_buf, output_buf, bias_tmp); + + if(activation > 0) + { + for(int i = 0; i < output_h * output_w; i++) + output_buf[i] = std::min(output_buf[i], ( float )activation); + } + + input_buf += channel_size; + output_buf += output_h * output_w; + weight_buf += 9; + } + } +} + +bool Conv2dDepth::Run(Node* node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Convolution* conv_op = dynamic_cast(node->GetOp()); + ConvParam* param = conv_op->GetParam(); + const TShape& input_shape = input_tensor->GetShape(); + + int input_c = input_shape.GetC(); + int input_h = input_shape.GetH(); + int input_w = input_shape.GetW(); + + /* output */ + Tensor* output_tensor = node->GetOutputTensor(0); + TShape& output_shape = output_tensor->GetShape(); + + int output_h = output_shape.GetH(); + int output_w = output_shape.GetW(); + int output_n = output_shape.GetN(); + + Tensor* weight_tensor = node->GetInputTensor(1); + float* weight_buf = ( float* )get_tensor_mem(weight_tensor); + float* input_buf = ( float* )get_tensor_mem(input_tensor); + float* output_buf = ( float* )get_tensor_mem(output_tensor); + + int stride_h = param->stride_h; + int cpu_number = cpu_info->GetCPUNumber(); + + float* bias = NULL; + // get bias + if(node->GetInputNum() > 2) + { + Tensor* bias_tensor = node->GetInputTensor(2); + bias = ( float* )get_tensor_mem(bias_tensor); + } + + for(int i = 0; i < output_n; i++) + { + if(cpu_number == 1) + 
DirectConv(input_buf, input_h, input_w, output_buf, output_h, output_w, weight_buf, input_c, stride_h, + bias); + else + { + // partition into 4 tasks + std::vector task_list; + std::vector param_list; + + auto f = std::bind(&Conv2dDepth::Aider, this, std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + + task_list.resize(cpu_number); + param_list.resize(cpu_number); + + int step = input_c / cpu_number; + int channel_size = input_h * input_w; + + for(int i = 0; i < cpu_number; i++) + { + dw_param* param = ¶m_list[i]; + sub_op_task* task = &task_list[i]; + + task->exec_func = f; + task->seq = i; + task->data = param; + + param->input_buf = input_buf; + param->input_h = input_h; + param->input_w = input_w; + param->output_buf = output_buf; + param->output_h = output_h; + param->output_w = output_w; + param->weight_buf = weight_buf; + param->channel_num = step; + param->stride = stride_h; + if(NULL != bias) + param->bias = bias + i * step; + else + param->bias = NULL; + + input_buf += channel_size * step; + if(stride_h == 1) + output_buf += channel_size * step; + else + output_buf += output_h * output_w * step; + weight_buf += 9 * step; + } + + // the last left ones + param_list[cpu_number - 1].channel_num += input_c - cpu_number * step; + + task_dispatch(task_list, -1); + + wait_done(); + } + } + + return true; +} + +static bool isDepthwiseSupported(const ConvParam* param, const TShape& input_shape) +{ + int input_c = input_shape.GetC(); + int group = param->group; + int kernel_h = param->kernel_h; + int kernel_w = param->kernel_w; + int stride_h = param->stride_h; + int stride_w = param->stride_w; + int dilation_h = param->dilation_h; + int dilation_w = param->dilation_w; + int pad_h0 = param->pad_h0; + int pad_w0 = param->pad_w0; + int pad_h1 = param->pad_h1; + int pad_w1 = param->pad_w1; + + if(group == 1 || input_c != group || kernel_h != 3 || kernel_w != 3 || pad_h0 != 1 || pad_w0 != 1 || + pad_h0 != pad_h1 || pad_w0 != pad_w1 || dilation_h 
!= 1 || dilation_w != 1 || stride_w != stride_h) + { + return false; + } + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + + if(exec_attr->graph_layout == TENGINE_LAYOUT_NHWC) + return nullptr; + + Operator* op = node->GetOp(); + + Convolution* conv_op = dynamic_cast(op); + + ConvParam* param = conv_op->GetParam(); + + const TShape& input_shape = node->GetInputTensor(0)->GetShape(); + + if(!isDepthwiseSupported(param, input_shape)) + return nullptr; + + Conv2dDepth* ops = new Conv2dDepth(); + + ops->activation = param->activation; + + ops->need_free = true; + + return ops; +} + +} // namespace conv_2d_dw + +void RegisterConv2dDepth(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor("arm64", "Convolution", conv_2d_dw::SelectFunc, + conv_2d_dw::default_prio); +} + +} // namespace TEngine diff --git a/executor/operator/arm64/conv/conv_2d_fast.cpp b/executor/operator/arm64/conv/conv_2d_fast.cpp index 60ed834e1..e066df92f 100644 --- a/executor/operator/arm64/conv/conv_2d_fast.cpp +++ b/executor/operator/arm64/conv/conv_2d_fast.cpp @@ -1,1077 +1,1072 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: xiaowei@openailab.com - */ -#include -#include -#include -#include - -#include "logger.hpp" -#include "node_ops.hpp" -#include "tensor_mem.hpp" - -#include "graph.hpp" -#include "operator/convolution.hpp" -#include - -extern "C" void sgemm_4x16_interleave(bool have_biases, float* biases, float* input, float* kernel, float* output, - long kernel_size); -extern "C" void sgemm_4x4_interleave(bool have_biases, float* biases, float* input, float* kernel, float* output, - long kernel_size); -extern "C" void sgemm_4x16_interleave_relu_fused(bool have_biases, float* biases, float* input, float* kernel, - float* output, long kernel_size); -extern "C" void sgemm_4x4_interleave_relu_fused(bool have_biases, float* biases, float* input, float* kernel, - float* output, long kernel_size); - -namespace TEngine { - -namespace conv_fast { - -#define TYPE_A53 0 -#define TYPE_A72 1 -const char* conv_name = "CONV_FAST"; -const int default_prio = 1000; - -void im2col(float* im, float* col, int input_chan, int input_x, int input_y, int kernel_x, int kernel_y, int stride_x, - int stride_y, int dilation_x, int dilation_y, int pad_x0, int pad_x1, int pad_y0, int pad_y1, int output_x, - int output_y, int col_start, int col_end) -{ - int kernel_size = kernel_x * kernel_y * input_chan; - int input_xy = input_x * input_y; - int pad_x = pad_x0; - int pad_y = pad_y0; - float* cur_col = col + col_start * kernel_size; - bool is_1x1 = (kernel_x == 1) && (kernel_y == 1) && (stride_x == 1) && (stride_y == 1); - bool is_dilation = (dilation_x != 1) || (dilation_y != 1); - bool is_3x3 = (kernel_x == 3) && (kernel_y == 3) && (!is_dilation); - int col_i, col_j, kch, ky, kx, i, j; - - if(is_1x1) - { - for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) - { - for(col_j = 0; col_j < kernel_size; col_j++) - { - for(i = 0; i < 4; i++) - *cur_col++ = *(im + input_xy * col_j + col_i + i); - } - } - // final 4 input - if(col_end & 0x3) - { - 
for(col_j = 0; col_j < kernel_size; col_j++) - { - for(i = 0; i < 4; i++) - { - if((col_i + i) < col_end) - *cur_col++ = *(im + input_xy * col_j + col_i + i); - else - *cur_col++ = 0.0; - } - } - } - } - else if(is_3x3) - { - int stride_x2 = stride_x * 2; - int stride_x3 = stride_x * 3; - bool is_pad0 = (pad_x0 == 0) && (pad_y0 == 0) && (pad_x1 == 0) && (pad_y1 == 0); - for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) - { - cur_col = col + col_i * kernel_size; - int imy0 = col_i / output_x; - int imy3 = (col_i + 3) / output_x; - int imx0 = col_i - imy0 * output_x; - int imx3 = (col_i + 3) - imy3 * output_x; - if((imy0 == imy3) && - (is_pad0 || (imy0 != 0 && imx0 != 0 && imy0 != (output_y - 1) && imx3 != (output_x - 1)))) - { - float* l0 = im + (imy0 * stride_y - pad_y) * input_x + (imx0 * stride_x - pad_x); - float* l1 = l0 + input_x; - float* l2 = l0 + input_x * 2; - for(i = 0; i < input_chan; i++) - { - for(j = 0; j < 3; j++) - { - cur_col[j * 4 + 0] = l0[j]; - cur_col[j * 4 + 1] = l0[j + stride_x]; - cur_col[j * 4 + 2] = l0[j + stride_x2]; - cur_col[j * 4 + 3] = l0[j + stride_x3]; - cur_col[j * 4 + 12] = l1[j]; - cur_col[j * 4 + 13] = l1[j + stride_x]; - cur_col[j * 4 + 14] = l1[j + stride_x2]; - cur_col[j * 4 + 15] = l1[j + stride_x3]; - cur_col[j * 4 + 24] = l2[j]; - cur_col[j * 4 + 25] = l2[j + stride_x]; - cur_col[j * 4 + 26] = l2[j + stride_x2]; - cur_col[j * 4 + 27] = l2[j + stride_x3]; - } - cur_col += 36; - l0 += input_xy; - l1 += input_xy; - l2 += input_xy; - } - } - else - { - int cnt_y[4] = {imy0, (col_i + 1) / output_x, (col_i + 2) / output_x, imy3}; - int cnt_x[4] = {imx0, col_i - cnt_y[1] * output_x + 1, col_i - cnt_y[2] * output_x + 2, imx3}; - int imx_start[4] = {cnt_x[0] * stride_x - pad_x, cnt_x[1] * stride_x - pad_x, - cnt_x[2] * stride_x - pad_x, cnt_x[3] * stride_x - pad_x}; - int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, - cnt_y[2] * stride_y - pad_y, cnt_y[3] * stride_y - pad_y}; - for(kch = 
0; kch < input_chan; kch++) - for(ky = 0; ky < 3; ky++) - for(kx = 0; kx < 3; kx++) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for(i = 0; i < 4; i++) - { - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) - *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); - else - *cur_col++ = 0.0; - } - } - } - } - // final 4 input - if(col_end & 0x3) - { - int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; - int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, - col_i - cnt_y[2] * output_x + 2, col_i - cnt_y[3] * output_x + 3}; - int imx_start[4] = {cnt_x[0] * stride_x - pad_x, cnt_x[1] * stride_x - pad_x, cnt_x[2] * stride_x - pad_x, - cnt_x[3] * stride_x - pad_x}; - int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, - cnt_y[3] * stride_y - pad_y}; - for(kch = 0; kch < input_chan; kch++) - for(ky = 0; ky < 3; ky++) - for(kx = 0; kx < 3; kx++) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for(i = 0; i < 4; i++) - { - if((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && - imy[i] < input_y) - *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); - else - *cur_col++ = 0.0; - } - } - } - } - else - { // for general cases - for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) - { - int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; - int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, - col_i - cnt_y[2] * output_x + 2, col_i - cnt_y[3] * output_x + 3}; - int imx_start[4] = {cnt_x[0] * stride_x - 
pad_x, cnt_x[1] * stride_x - pad_x, cnt_x[2] * stride_x - pad_x, - cnt_x[3] * stride_x - pad_x}; - int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, - cnt_y[3] * stride_y - pad_y}; - for(kch = 0; kch < input_chan; kch++) - for(ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y) - for(kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for(i = 0; i < 4; i++) - { - if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) - *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); - else - *cur_col++ = 0.0; - } - } - } - // final 4 input - if(col_end & 0x3) - { - int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; - int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, - col_i - cnt_y[2] * output_x + 2, col_i - cnt_y[3] * output_x + 3}; - int imx_start[4] = {cnt_x[0] * stride_x - pad_x, cnt_x[1] * stride_x - pad_x, cnt_x[2] * stride_x - pad_x, - cnt_x[3] * stride_x - pad_x}; - int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, - cnt_y[3] * stride_y - pad_y}; - for(kch = 0; kch < input_chan; kch++) - for(ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y) - for(kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for(i = 0; i < 4; i++) - { - if((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && - imy[i] < input_y) - *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); - else - *cur_col++ = 0.0; - } - } - } - } -} - -// interleave 0 ~ 
(output_chan & -16) kernels with 16 in form of k[0-15][0],k[0-15][1],k[0-15][2].. -// interleave (output_chan & -16) ~ ((output_chan + 3) & -4) tail kernls with 4 in form of -// k[0-3][0],k[0-3][1],k[0-3][2].. -void interleave_kernel(float* kernel, float* kernel_interleaved, int kernel_chan, int kernel_size) -{ - int i, j; - float *cur_kernel0, *cur_kernel1, *cur_kernel2, *cur_kernel3, *cur_kernel4, *cur_kernel5, *cur_kernel6, - *cur_kernel7; - float *cur_kernel8, *cur_kernel9, *cur_kernel10, *cur_kernel11, *cur_kernel12, *cur_kernel13, *cur_kernel14, - *cur_kernel15; - float* cur_kernel_interleaved = kernel_interleaved; - - // interleave 16 kernels - for(i = 0; i < (kernel_chan & -16); i += 16) - { - cur_kernel0 = kernel + kernel_size * i; - cur_kernel1 = kernel + kernel_size * (i + 1); - cur_kernel2 = kernel + kernel_size * (i + 2); - cur_kernel3 = kernel + kernel_size * (i + 3); - cur_kernel4 = kernel + kernel_size * (i + 4); - cur_kernel5 = kernel + kernel_size * (i + 5); - cur_kernel6 = kernel + kernel_size * (i + 6); - cur_kernel7 = kernel + kernel_size * (i + 7); - cur_kernel8 = kernel + kernel_size * (i + 8); - cur_kernel9 = kernel + kernel_size * (i + 9); - cur_kernel10 = kernel + kernel_size * (i + 10); - cur_kernel11 = kernel + kernel_size * (i + 11); - cur_kernel12 = kernel + kernel_size * (i + 12); - cur_kernel13 = kernel + kernel_size * (i + 13); - cur_kernel14 = kernel + kernel_size * (i + 14); - cur_kernel15 = kernel + kernel_size * (i + 15); - for(j = 0; j < kernel_size; j++) - { - *(cur_kernel_interleaved++) = cur_kernel0[j]; - *(cur_kernel_interleaved++) = cur_kernel1[j]; - *(cur_kernel_interleaved++) = cur_kernel2[j]; - *(cur_kernel_interleaved++) = cur_kernel3[j]; - *(cur_kernel_interleaved++) = cur_kernel4[j]; - *(cur_kernel_interleaved++) = cur_kernel5[j]; - *(cur_kernel_interleaved++) = cur_kernel6[j]; - *(cur_kernel_interleaved++) = cur_kernel7[j]; - *(cur_kernel_interleaved++) = cur_kernel8[j]; - *(cur_kernel_interleaved++) = 
cur_kernel9[j]; - *(cur_kernel_interleaved++) = cur_kernel10[j]; - *(cur_kernel_interleaved++) = cur_kernel11[j]; - *(cur_kernel_interleaved++) = cur_kernel12[j]; - *(cur_kernel_interleaved++) = cur_kernel13[j]; - *(cur_kernel_interleaved++) = cur_kernel14[j]; - *(cur_kernel_interleaved++) = cur_kernel15[j]; - } - } - - for(i = (kernel_chan & -16); i < (kernel_chan & -4); i += 4) - { - cur_kernel0 = kernel + kernel_size * i; - cur_kernel1 = kernel + kernel_size * (i + 1); - cur_kernel2 = kernel + kernel_size * (i + 2); - cur_kernel3 = kernel + kernel_size * (i + 3); - for(j = 0; j < kernel_size; j++) - { - *(cur_kernel_interleaved++) = cur_kernel0[j]; - *(cur_kernel_interleaved++) = cur_kernel1[j]; - *(cur_kernel_interleaved++) = cur_kernel2[j]; - *(cur_kernel_interleaved++) = cur_kernel3[j]; - } - } - // last 4 kernel - cur_kernel0 = kernel + kernel_size * i; - cur_kernel1 = kernel + kernel_size * (i + 1); - cur_kernel2 = kernel + kernel_size * (i + 2); - if((kernel_chan & 0x3) == 3) - { - for(j = 0; j < kernel_size; j++) - { - *(cur_kernel_interleaved++) = cur_kernel0[j]; - *(cur_kernel_interleaved++) = cur_kernel1[j]; - *(cur_kernel_interleaved++) = cur_kernel2[j]; - *(cur_kernel_interleaved++) = 0.0; - } - } - else if((kernel_chan & 0x3) == 2) - { - for(j = 0; j < kernel_size; j++) - { - *(cur_kernel_interleaved++) = cur_kernel0[j]; - *(cur_kernel_interleaved++) = cur_kernel1[j]; - *(cur_kernel_interleaved++) = 0.0; - *(cur_kernel_interleaved++) = 0.0; - } - } - else if((kernel_chan & 0x3) == 1) - { - for(j = 0; j < kernel_size; j++) - { - *(cur_kernel_interleaved++) = cur_kernel0[j]; - *(cur_kernel_interleaved++) = 0.0; - *(cur_kernel_interleaved++) = 0.0; - *(cur_kernel_interleaved++) = 0.0; - } - } - - return; -} - -static void sgemm4x16(float* col, float* kernel, float* biases, bool bias_term, float* output, int kernel_size, - int col_start, int col_end, int kernel_start, int kernel_end, int output_xy, int activation, - int cpu_type) -{ - float initial[64], 
result[64]; - int col_line, kernel_num; - int i, j; - float *cur_col, *cur_kernel; - - for(kernel_num = (kernel_start & -16); kernel_num < (kernel_end & -16); kernel_num += 16) - { - if(bias_term) - for(i = 0; i < 64; i++) - initial[i] = *(biases + kernel_num + (i >> 2)); - cur_kernel = ( float* )(kernel + kernel_num * kernel_size); - - for(col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) - { - cur_col = ( float* )(col + col_line * kernel_size); - if(activation >= 0) - sgemm_4x16_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - else - sgemm_4x16_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - - if(activation > 0) - { - for(i = 0; i < 16; i++) - { - *(output + (kernel_num + i) * output_xy + col_line) = - std::min(result[(i << 2)], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 1) = - std::min(result[(i << 2) + 1], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 2) = - std::min(result[(i << 2) + 2], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 3) = - std::min(result[(i << 2) + 3], ( float )activation); - } - } - else - { - for(i = 0; i < 16; i++) - { - *(output + (kernel_num + i) * output_xy + col_line) = result[(i << 2)]; - *(output + (kernel_num + i) * output_xy + col_line + 1) = result[(i << 2) + 1]; - *(output + (kernel_num + i) * output_xy + col_line + 2) = result[(i << 2) + 2]; - *(output + (kernel_num + i) * output_xy + col_line + 3) = result[(i << 2) + 3]; - } - } - } - if(col_end & 0x3) - { - cur_col = ( float* )(col + col_line * kernel_size); - - if(activation >= 0) - sgemm_4x16_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - else - sgemm_4x16_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - - for(i = 0; i < 16; i++) - for(j = 0; j < (col_end & 0x3); j++) - { - if(activation > 0) - *(output + (kernel_num + i) * 
output_xy + col_line + j) = - std::min(result[(i << 2) + j], ( float )activation); - else - *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - } - } -} - -static void sgemm4x4(float* col, float* kernel, float* biases, bool bias_term, float* output, int kernel_size, - int col_start, int col_end, int kernel_start, int kernel_end, int output_xy, int activation, - int cpu_type) -{ - float initial[16], result[16]; - int col_line, kernel_num; - int i, j; - float *cur_col, *cur_kernel; - - for(kernel_num = kernel_start & -4; kernel_num < (kernel_end & -4); kernel_num += 4) - { - if(bias_term) - for(i = 0; i < 16; i++) - initial[i] = *(biases + kernel_num + (i >> 2)); - cur_kernel = ( float* )(kernel + kernel_num * kernel_size); - for(col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) - { - cur_col = ( float* )(col + col_line * kernel_size); - - if(activation >= 0) - sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - else - sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - - if(activation > 0) - { - for(i = 0; i < 4; i++) - { - *(output + (kernel_num + i) * output_xy + col_line) = - std::min(result[(i << 2) + 0], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 1) = - std::min(result[(i << 2) + 1], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 2) = - std::min(result[(i << 2) + 2], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 3) = - std::min(result[(i << 2) + 3], ( float )activation); - } - } - else - { - for(i = 0; i < 4; i++) - { - *(output + (kernel_num + i) * output_xy + col_line) = result[(i << 2) + 0]; - *(output + (kernel_num + i) * output_xy + col_line + 1) = result[(i << 2) + 1]; - *(output + (kernel_num + i) * output_xy + col_line + 2) = result[(i << 2) + 2]; - *(output + (kernel_num + i) * output_xy + col_line + 3) = result[(i << 2) + 3]; 
- } - } - } - if(col_end & 0x3) - { - cur_col = ( float* )(col + col_line * kernel_size); - if(activation >= 0) - sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - else - sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - for(i = 0; i < 4; i++) - { - for(j = 0; j < (col_end & 0x3); j++) - { - if(activation > 0) - *(output + (kernel_num + i) * output_xy + col_line + j) = - std::min(result[(i << 2) + j], ( float )activation); - else - *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - } - } - } - - if(kernel_end & 0x3) - { - if(bias_term) - for(i = 0; i < ((kernel_end & 0x3) << 2); i++) - initial[i] = *(biases + kernel_num + (i >> 2)); - cur_kernel = ( float* )(kernel + kernel_num * kernel_size); - for(col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) - { - cur_col = ( float* )(col + col_line * kernel_size); - - if(activation >= 0) - sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - else - sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - - if(activation > 0) - { - for(i = 0; i < (kernel_end & 0x3); i++) - { - *(output + (kernel_num + i) * output_xy + col_line) = - std::min(result[(i << 2) + 0], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 1) = - std::min(result[(i << 2) + 1], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 2) = - std::min(result[(i << 2) + 2], ( float )activation); - *(output + (kernel_num + i) * output_xy + col_line + 3) = - std::min(result[(i << 2) + 3], ( float )activation); - } - } - else - { - for(i = 0; i < (kernel_end & 0x3); i++) - { - *(output + (kernel_num + i) * output_xy + col_line) = result[(i << 2) + 0]; - *(output + (kernel_num + i) * output_xy + col_line + 1) = result[(i << 2) + 1]; - *(output + (kernel_num + i) * output_xy + col_line + 2) = result[(i << 2) 
+ 2]; - *(output + (kernel_num + i) * output_xy + col_line + 3) = result[(i << 2) + 3]; - } - } - } - if(col_end & 0x3) - { - cur_col = ( float* )(col + col_line * kernel_size); - if(activation >= 0) - sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - else - sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); - - for(i = 0; i < (kernel_end & 0x3); i++) - { - for(j = 0; j < (col_end & 0x3); j++) - { - if(activation > 0) - *(output + (kernel_num + i) * output_xy + col_line + j) = - std::min(result[(i << 2) + j], ( float )activation); - else - *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - } - } - } -} - -struct im2col_param -{ - float* im; - float* col; - int input_chan; - int input_x; - int input_y; - int kernel_x; - int kernel_y; - int stride_x; - int stride_y; - int dilation_x; - int dilation_y; - int pad_x0; - int pad_x1; - int pad_y0; - int pad_y1; - int output_x; - int output_y; - int col_start; - int col_end; -}; - -struct sgemm_param -{ - float* col; - float* kernel; - float* biases; - bool bias_term; - float* output; - int kernel_size; - int col_start; - int col_end; - int kernel_start; - int kernel_end; - int output_xy; -}; - -struct conv1x1s1_param -{ - const float* input; - float* output; - const float* kernel; - const float* bias; - int in_h; - int in_w; - int in_ch; - int out_h; - int out_w; - int out_ch; - bool relu_fused; -}; - -struct ConvFast : public MTNodeOps -{ - bool Prerun(Node* node) override; - bool Reshape(Node* node) override; - bool Run(Node* node) override; - bool Postrun(Node* node) override; - bool GetSharedMemorySize(Node*, unsigned int& mem_size) override; - bool SetSharedMemoryAddr(Node*, void* mem_addr, int mem_size) override; - - bool float_mode; - bool im2col_aider(int cpu, int seq, void* data /* im2col_param * param */); - bool sgemm_aider(int cpu, int seq, void* data /* sgemm_param * param */); - bool 
sgemm4x4_aider(int cpu, int seq, void* data /* sgemm_param * param */); - - int activation; - bool dynamic_shape; -}; - -bool ConvFast::im2col_aider(int cpu, int seq, void* data) -{ - im2col_param* param = ( im2col_param* )(data); - im2col(param->im, param->col, param->input_chan, param->input_x, param->input_y, param->kernel_x, param->kernel_y, - param->stride_x, param->stride_y, param->dilation_x, param->dilation_y, param->pad_x0, param->pad_x1, - param->pad_y0, param->pad_y1, param->output_x, param->output_y, param->col_start, param->col_end); - - return true; -} - -bool ConvFast::sgemm4x4_aider(int cpu, int seq, void* data) -{ - int cpu_type = TYPE_A72; - sgemm_param* param = ( sgemm_param* )(data); - - sgemm4x4(param->col, param->kernel, param->biases, param->bias_term, param->output, param->kernel_size, - param->col_start, param->col_end, param->kernel_start, param->kernel_end, param->output_xy, activation, - cpu_type); - - return true; -} - -bool ConvFast::sgemm_aider(int cpu, int seq, void* data) -{ - int cpu_type = TYPE_A72; - sgemm_param* param = ( sgemm_param* )(data); - - sgemm4x16(param->col, param->kernel, param->biases, param->bias_term, param->output, param->kernel_size, - param->col_start, param->col_end, param->kernel_start, param->kernel_end, param->output_xy, activation, - cpu_type); - - return true; -} - -bool ConvFast::Prerun(Node* node) -{ - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - int group = param->group; - - Tensor* output_tensor = node->GetOutputTensor(0); - TShape& output_shape = output_tensor->GetShape(); - int output_chan = output_shape.GetC() / group; - - /* pre-allocate col_buf */ - Tensor* input_tensor = node->GetInputTensor(0); - TShape& input_shape = input_tensor->GetShape(); - - int input_chan = input_shape.GetC() / group; - int kernel_size = input_chan * param->kernel_h * param->kernel_w; - - if(!dynamic_shape) - { - if(node->ExistAttr("shared_col_buf")) - { - float* addr = 
( float* )any_cast(node->GetAttr("shared_col_buf")); - - (*node)["col_buf"] = addr; - } - else - { - unsigned int col_size; - - GetSharedMemorySize(node, col_size); - - float* col_buf = ( float* )mem_alloc(col_size); - (*node)["col_buf"] = col_buf; - node->SetAttr("col_buf_allocated", col_size); - } - } - - /* packing kernel data */ - Tensor* kernel_tensor = node->GetInputTensor(1); - - float* kernel_interleaved = NULL; - - int kernel_interleaved_size_g = kernel_size * ((output_chan + 3) & -4); - int kernel_size_g = kernel_size * output_chan; - float* kernel_org = ( float* )get_tensor_mem(kernel_tensor); - kernel_interleaved = ( float* )mem_alloc(sizeof(float) * (kernel_interleaved_size_g * group) + 128); - - for(int g = 0; g < group; ++g) - { - float* kernel = kernel_org + g * kernel_size_g; - float* kernel_interleaved_g = kernel_interleaved + g * kernel_interleaved_size_g; - interleave_kernel(kernel, kernel_interleaved_g, output_chan, kernel_size); - } - - (*node)["kernel_interleaved"] = kernel_interleaved; - - if(exec_attr->low_mem_mode) - { - printf("free convolution kernel: %s %d\n", kernel_tensor->GetName().c_str(), kernel_tensor->GetTotalSize()); - - kernel_tensor->FreeMem(); - } - - return true; -} - -bool ConvFast::Reshape(Node* node) -{ - unsigned int new_col_size; - - GetSharedMemorySize(node, new_col_size); - - if(node->ExistAttr("col_buf_allocated")) - { - unsigned int col_size = any_cast(node->GetAttr("col_buf_allocated")); - if(new_col_size == col_size) - return true; - - float* addr = any_cast(node->GetAttr("col_buf")); - mem_free(addr); - } - - float* col_buf = ( float* )mem_alloc(new_col_size); - (*node)["col_buf"] = col_buf; - - node->SetAttr("col_buf_allocated", new_col_size); - return true; -} - -bool ConvFast::Run(Node* node) -{ - /* input */ - Tensor* input_tensor = node->GetInputTensor(0); - - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - - const TShape& input_shape = 
input_tensor->GetShape(); - - int group = param->group; - int input_chan = input_shape.GetC() / group; - int input_h = input_shape.GetH(); - int input_w = input_shape.GetW(); - int input_size = input_w * input_h * input_chan; - int pad_x0 = param->pads[1]; // left padding columns - int pad_x1 = param->pads[3]; // right padding columns - int pad_y0 = param->pads[0]; // top padding rows - int pad_y1 = param->pads[2]; // bottom padding rows - int stride_x = param->stride_w; - int stride_y = param->stride_h; - int dilation_x = param->dilation_w; - int dilation_y = param->dilation_h; - float* input_org = ( float* )get_tensor_mem(input_tensor); - float* col = any_cast(node->GetAttr("col_buf")); - - /* output */ - Tensor* output_tensor = node->GetOutputTensor(0); - TShape& output_shape = output_tensor->GetShape(); - float* output_org = ( float* )get_tensor_mem(output_tensor); - int output_y = output_shape.GetH(); - int output_x = output_shape.GetW(); - int output_xy = output_x * output_y; - int output_chan = output_shape.GetC() / group; - int output_n = output_shape.GetN(); - - /* kernel */ - int kernel_x = param->kernel_w; - int kernel_y = param->kernel_h; - int kernel_size = input_chan * kernel_x * kernel_y; - - float* kernel_interleaved = any_cast(node->GetAttr("kernel_interleaved")); - - int cpu_number = cpu_info->GetCPUNumber(); - - /* biases */ - - float* biases = NULL; - bool have_biases = (node->GetInputNum() > 2); - - if(have_biases) - { - biases = ( float* )get_tensor_mem(node->GetInputTensor(2)); - } - - int cpu_type; - - if(cpu_info->GetCPUModel(cpu_info->GetMasterCPU()) == CPU_A72) - cpu_type = TYPE_A72; - else - cpu_type = TYPE_A53; - - /* block size split parameter */ - int L2_CACHE_SIZE = (cpu_type == TYPE_A53) ? 512 * 1024 : 1024 * 1024; - int kernel_size_l1 = kernel_size; - int col_cnt_l2 = L2_CACHE_SIZE / 4 / kernel_size_l1 * 7 / 8; - col_cnt_l2 = col_cnt_l2 > 4 ? 
(col_cnt_l2 & -4) : 4; - - /* one image per time */ - for(int i = 0; i < output_n; i++) - { - float* input = input_org + i * input_size * group; - float* output = output_org + i * output_xy * output_chan * group; - - for(int g = 0; g < group; g++) - { - float* input_g = input + g * input_size; - int total_num = output_xy * input_chan * kernel_x * kernel_y; - - if(cpu_number == 1 || total_num < 100 * 1000) - im2col(input_g, col, input_chan, input_w, input_h, kernel_x, kernel_y, stride_x, stride_y, dilation_x, - dilation_y, pad_x0, pad_x1, pad_y0, pad_y1, output_x, output_y, 0, output_xy); - else - { - std::vector task_list; - std::vector param_list; - - auto f = std::bind(&ConvFast::im2col_aider, this, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - - int steps = output_xy / cpu_number; - - steps = (steps + 3) & (~0x3); - - int offset; - int real_cpu_number = cpu_number; - - while(1) - { - offset = steps * real_cpu_number - output_xy; - - if(offset < steps) - break; - - real_cpu_number--; - } - - task_list.resize(real_cpu_number); - param_list.resize(real_cpu_number); - - for(int i = 0; i < real_cpu_number; i++) - { - im2col_param* param = ¶m_list[i]; - sub_op_task* task = &task_list[i]; - - task->exec_func = f; - task->seq = i; - task->data = param; - - param->im = input_g; - param->col = col; - param->input_chan = input_chan; - param->input_x = input_w; - param->input_y = input_h; - param->kernel_x = kernel_x; - param->kernel_y = kernel_y; - param->stride_x = stride_x; - param->stride_y = stride_y; - param->dilation_x = dilation_x; - param->dilation_y = dilation_y; - param->pad_x0 = pad_x0; - param->pad_x1 = pad_x1; - param->pad_y0 = pad_y0; - param->pad_y1 = pad_y1; - param->output_x = output_x; - param->output_y = output_y; - param->col_start = i * steps; - param->col_end = param->col_start + steps; - } - - param_list[real_cpu_number - 1].col_end = output_xy; - - task_dispatch(task_list, -1); - wait_done(); - } - - float* kernel_g = 
kernel_interleaved + g * (kernel_size * ((output_chan + 3) & -4)); - float* output_g = output + g * output_xy * output_chan; - float* bias_g = biases + g * output_chan; - - std::vector task_list; - std::vector param_list; - - int chan_16_num = output_chan / 16; - int chan_4_num = (output_chan & 0xf) ? 1 : 0; - int l2_loop = (output_xy - 1) / col_cnt_l2 + 1; - int max_task_num = l2_loop * (chan_16_num + chan_4_num); - - if(cpu_number > 1) - param_list.resize(max_task_num); - - // for input block of L2 cache size - for(int col_i = 0; col_i < output_xy; col_i += col_cnt_l2) - { - int col_start = col_i; - int col_end = col_i + col_cnt_l2; - col_end = col_end > output_xy ? output_xy : col_end; - - if(cpu_number == 1) - { - sgemm4x16(col, kernel_g, bias_g, have_biases, output_g, kernel_size, col_start, col_end, 0, - output_chan & -16, output_xy, activation, cpu_type); - if(output_chan & 0xf) - sgemm4x4(col, kernel_g, bias_g, have_biases, output_g, kernel_size, col_start, col_end, - output_chan & -16, output_chan, output_xy, activation, cpu_type); - } - else - { - auto f = std::bind(&ConvFast::sgemm_aider, this, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - - for(int i = 0; i < chan_16_num; i++) - { - sub_op_task tmp_task; - sgemm_param* param = ¶m_list[task_list.size()]; - sub_op_task* task = &tmp_task; - task->exec_func = f; - task->seq = i; - task->data = param; - - param->col = col; - param->kernel = kernel_g; - param->biases = bias_g; - param->bias_term = have_biases; - param->output = output_g; - param->kernel_size = kernel_size; - param->col_start = col_start; - param->col_end = col_end; - param->kernel_start = i * 16; - param->kernel_end = param->kernel_start + 16; - param->output_xy = output_xy; - - task_list.emplace_back(tmp_task); - } - - if(output_chan & 0xf) - { - auto f = std::bind(&ConvFast::sgemm4x4_aider, this, std::placeholders::_1, - std::placeholders::_2, std::placeholders::_3); - sub_op_task tmp_task; - sgemm_param* param = 
¶m_list[task_list.size()]; - sub_op_task* task = &tmp_task; - task->exec_func = f; - task->seq = task_list.size() - 1; - task->data = param; - - param->col = col; - param->kernel = kernel_g; - param->biases = bias_g; - param->bias_term = have_biases; - param->output = output_g; - param->kernel_size = kernel_size; - param->col_start = col_start; - param->col_end = col_end; - param->kernel_start = output_chan & -16; - param->kernel_end = output_chan; - param->output_xy = output_xy; - - task_list.emplace_back(tmp_task); - } - } - } - - if(cpu_number > 1) - { - task_dispatch(task_list, -1); - wait_done(); - } - } - } - - return true; -} - -bool ConvFast::Postrun(Node* node) -{ - if(node->ExistAttr("kernel_interleaved")) - { - float* addr; - addr = any_cast(node->GetAttr("kernel_interleaved")); - - mem_free(addr); - - node->RemoveAttr("kernel_interleaved"); - } - - if(node->ExistAttr("col_buf_allocated")) - { - float* addr = any_cast(node->GetAttr("col_buf")); - mem_free(addr); - - node->RemoveAttr("col_buf_allocated"); - } - - if(node->ExistAttr("col_buf")) - node->RemoveAttr("col_buf"); - - return true; -} - -bool ConvFast::GetSharedMemorySize(Node* node, unsigned int& mem_size) -{ - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - int group = param->group; - - Tensor* output_tensor = node->GetOutputTensor(0); - TShape& output_shape = output_tensor->GetShape(); - int output_y = output_shape.GetH(); - int output_x = output_shape.GetW(); - - Tensor* input_tensor = node->GetInputTensor(0); - TShape& input_shape = input_tensor->GetShape(); - - int input_chan = input_shape.GetC() / group; - int kernel_size = input_chan * param->kernel_h * param->kernel_w; - int output_xy = output_x * output_y; - - mem_size = (sizeof(float) * (kernel_size * ((output_xy + 3) & -4)) + 128); - - return true; -} - -bool ConvFast::SetSharedMemoryAddr(Node* node, void* mem_addr, int mem_size) -{ - (*node)["shared_col_buf"] = mem_addr; - return true; 
-} - -NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) -{ - const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); - - if(exec_attr->kernel_mode != EXEC_KERNEL_FP32) - return nullptr; - - if(exec_attr->layout == TENGINE_LAYOUT_NHWC) - return nullptr; - - ConvFast* ops = new ConvFast(); - - ops->need_free = true; - - if(node->IsDynamicShape()) - ops->dynamic_shape = true; - else - ops->dynamic_shape = false; - - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - - ops->activation = param->activation; - - return ops; -} - -} // conv_fast - -void RegisterConv2dFast(void) -{ - NodeOpsRegistryManager::RegisterOPImplementor("arm64", "Convolution", conv_fast::SelectFunc, - conv_fast::default_prio); -} - -} // namespace TEngine +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: xiaowei@openailab.com + */ +#include +#include +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +#include "graph.hpp" +#include "operator/convolution.hpp" +#include + +extern "C" void sgemm_4x16_interleave(bool have_biases, float* biases, float* input, float* kernel, float* output, + long kernel_size); +extern "C" void sgemm_4x4_interleave(bool have_biases, float* biases, float* input, float* kernel, float* output, + long kernel_size); +extern "C" void sgemm_4x16_interleave_relu_fused(bool have_biases, float* biases, float* input, float* kernel, + float* output, long kernel_size); +extern "C" void sgemm_4x4_interleave_relu_fused(bool have_biases, float* biases, float* input, float* kernel, + float* output, long kernel_size); + +namespace TEngine { + +namespace conv_fast { + +#define TYPE_A53 0 +#define TYPE_A72 1 +const char* conv_name = "CONV_FAST"; +const int default_prio = 1000; + +void im2col(float* im, float* col, int input_chan, int input_x, int input_y, int kernel_x, int kernel_y, int stride_x, + int stride_y, int dilation_x, int dilation_y, int pad_x0, int pad_x1, int pad_y0, int pad_y1, int output_x, + int output_y, int col_start, int col_end) +{ + int kernel_size = kernel_x * kernel_y * input_chan; + int input_xy = input_x * input_y; + int pad_x = pad_x0; + int pad_y = pad_y0; + float* cur_col = col + col_start * kernel_size; + bool is_1x1 = (kernel_x == 1) && (kernel_y == 1) && (stride_x == 1) && (stride_y == 1); + bool is_dilation = (dilation_x != 1) || (dilation_y != 1); + bool is_3x3 = (kernel_x == 3) && (kernel_y == 3) && (!is_dilation); + int col_i, col_j, kch, ky, kx, i, j; + + if(is_1x1) + { + for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + { + for(col_j = 0; col_j < kernel_size; col_j++) + { + for(i = 0; i < 4; i++) + *cur_col++ = *(im + input_xy * col_j + col_i + i); + } + } + // final 4 input + if(col_end & 0x3) + { + 
for(col_j = 0; col_j < kernel_size; col_j++) + { + for(i = 0; i < 4; i++) + { + if((col_i + i) < col_end) + *cur_col++ = *(im + input_xy * col_j + col_i + i); + else + *cur_col++ = 0.0; + } + } + } + } + else if(is_3x3) + { + int stride_x2 = stride_x * 2; + int stride_x3 = stride_x * 3; + bool is_pad0 = (pad_x0 == 0) && (pad_y0 == 0) && (pad_x1 == 0) && (pad_y1 == 0); + for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + { + cur_col = col + col_i * kernel_size; + int imy0 = col_i / output_x; + int imy3 = (col_i + 3) / output_x; + int imx0 = col_i - imy0 * output_x; + int imx3 = (col_i + 3) - imy3 * output_x; + if((imy0 == imy3) && + (is_pad0 || (imy0 != 0 && imx0 != 0 && imy0 != (output_y - 1) && imx3 != (output_x - 1)))) + { + float* l0 = im + (imy0 * stride_y - pad_y) * input_x + (imx0 * stride_x - pad_x); + float* l1 = l0 + input_x; + float* l2 = l0 + input_x * 2; + for(i = 0; i < input_chan; i++) + { + for(j = 0; j < 3; j++) + { + cur_col[j * 4 + 0] = l0[j]; + cur_col[j * 4 + 1] = l0[j + stride_x]; + cur_col[j * 4 + 2] = l0[j + stride_x2]; + cur_col[j * 4 + 3] = l0[j + stride_x3]; + cur_col[j * 4 + 12] = l1[j]; + cur_col[j * 4 + 13] = l1[j + stride_x]; + cur_col[j * 4 + 14] = l1[j + stride_x2]; + cur_col[j * 4 + 15] = l1[j + stride_x3]; + cur_col[j * 4 + 24] = l2[j]; + cur_col[j * 4 + 25] = l2[j + stride_x]; + cur_col[j * 4 + 26] = l2[j + stride_x2]; + cur_col[j * 4 + 27] = l2[j + stride_x3]; + } + cur_col += 36; + l0 += input_xy; + l1 += input_xy; + l2 += input_xy; + } + } + else + { + int cnt_y[4] = {imy0, (col_i + 1) / output_x, (col_i + 2) / output_x, imy3}; + int cnt_x[4] = {imx0, col_i - cnt_y[1] * output_x + 1, col_i - cnt_y[2] * output_x + 2, imx3}; + int imx_start[4] = {cnt_x[0] * stride_x - pad_x, cnt_x[1] * stride_x - pad_x, + cnt_x[2] * stride_x - pad_x, cnt_x[3] * stride_x - pad_x}; + int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, + cnt_y[2] * stride_y - pad_y, cnt_y[3] * stride_y - pad_y}; + for(kch = 
0; kch < input_chan; kch++) + for(ky = 0; ky < 3; ky++) + for(kx = 0; kx < 3; kx++) + { + int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; + int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; + for(i = 0; i < 4; i++) + { + if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); + else + *cur_col++ = 0.0; + } + } + } + } + // final 4 input + if(col_end & 0x3) + { + int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; + int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, + col_i - cnt_y[2] * output_x + 2, col_i - cnt_y[3] * output_x + 3}; + int imx_start[4] = {cnt_x[0] * stride_x - pad_x, cnt_x[1] * stride_x - pad_x, cnt_x[2] * stride_x - pad_x, + cnt_x[3] * stride_x - pad_x}; + int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, + cnt_y[3] * stride_y - pad_y}; + for(kch = 0; kch < input_chan; kch++) + for(ky = 0; ky < 3; ky++) + for(kx = 0; kx < 3; kx++) + { + int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; + int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; + for(i = 0; i < 4; i++) + { + if((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && + imy[i] < input_y) + *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); + else + *cur_col++ = 0.0; + } + } + } + } + else + { // for general cases + for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4) + { + int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; + int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, + col_i - cnt_y[2] * output_x + 2, col_i - cnt_y[3] * output_x + 3}; + int imx_start[4] = {cnt_x[0] * stride_x - 
pad_x, cnt_x[1] * stride_x - pad_x, cnt_x[2] * stride_x - pad_x, + cnt_x[3] * stride_x - pad_x}; + int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, + cnt_y[3] * stride_y - pad_y}; + for(kch = 0; kch < input_chan; kch++) + for(ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y) + for(kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x) + { + int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; + int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; + for(i = 0; i < 4; i++) + { + if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y) + *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); + else + *cur_col++ = 0.0; + } + } + } + // final 4 input + if(col_end & 0x3) + { + int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x}; + int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1, + col_i - cnt_y[2] * output_x + 2, col_i - cnt_y[3] * output_x + 3}; + int imx_start[4] = {cnt_x[0] * stride_x - pad_x, cnt_x[1] * stride_x - pad_x, cnt_x[2] * stride_x - pad_x, + cnt_x[3] * stride_x - pad_x}; + int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y, + cnt_y[3] * stride_y - pad_y}; + for(kch = 0; kch < input_chan; kch++) + for(ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y) + for(kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x) + { + int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; + int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; + for(i = 0; i < 4; i++) + { + if((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && + imy[i] < input_y) + *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]); + else + *cur_col++ = 0.0; + } + } + } + } +} + +// interleave 0 ~ 
(output_chan & -16) kernels with 16 in form of k[0-15][0],k[0-15][1],k[0-15][2].. +// interleave (output_chan & -16) ~ ((output_chan + 3) & -4) tail kernls with 4 in form of +// k[0-3][0],k[0-3][1],k[0-3][2].. +void interleave_kernel(float* kernel, float* kernel_interleaved, int kernel_chan, int kernel_size) +{ + int i, j; + float *cur_kernel0, *cur_kernel1, *cur_kernel2, *cur_kernel3, *cur_kernel4, *cur_kernel5, *cur_kernel6, + *cur_kernel7; + float *cur_kernel8, *cur_kernel9, *cur_kernel10, *cur_kernel11, *cur_kernel12, *cur_kernel13, *cur_kernel14, + *cur_kernel15; + float* cur_kernel_interleaved = kernel_interleaved; + + // interleave 16 kernels + for(i = 0; i < (kernel_chan & -16); i += 16) + { + cur_kernel0 = kernel + kernel_size * i; + cur_kernel1 = kernel + kernel_size * (i + 1); + cur_kernel2 = kernel + kernel_size * (i + 2); + cur_kernel3 = kernel + kernel_size * (i + 3); + cur_kernel4 = kernel + kernel_size * (i + 4); + cur_kernel5 = kernel + kernel_size * (i + 5); + cur_kernel6 = kernel + kernel_size * (i + 6); + cur_kernel7 = kernel + kernel_size * (i + 7); + cur_kernel8 = kernel + kernel_size * (i + 8); + cur_kernel9 = kernel + kernel_size * (i + 9); + cur_kernel10 = kernel + kernel_size * (i + 10); + cur_kernel11 = kernel + kernel_size * (i + 11); + cur_kernel12 = kernel + kernel_size * (i + 12); + cur_kernel13 = kernel + kernel_size * (i + 13); + cur_kernel14 = kernel + kernel_size * (i + 14); + cur_kernel15 = kernel + kernel_size * (i + 15); + for(j = 0; j < kernel_size; j++) + { + *(cur_kernel_interleaved++) = cur_kernel0[j]; + *(cur_kernel_interleaved++) = cur_kernel1[j]; + *(cur_kernel_interleaved++) = cur_kernel2[j]; + *(cur_kernel_interleaved++) = cur_kernel3[j]; + *(cur_kernel_interleaved++) = cur_kernel4[j]; + *(cur_kernel_interleaved++) = cur_kernel5[j]; + *(cur_kernel_interleaved++) = cur_kernel6[j]; + *(cur_kernel_interleaved++) = cur_kernel7[j]; + *(cur_kernel_interleaved++) = cur_kernel8[j]; + *(cur_kernel_interleaved++) = 
cur_kernel9[j]; + *(cur_kernel_interleaved++) = cur_kernel10[j]; + *(cur_kernel_interleaved++) = cur_kernel11[j]; + *(cur_kernel_interleaved++) = cur_kernel12[j]; + *(cur_kernel_interleaved++) = cur_kernel13[j]; + *(cur_kernel_interleaved++) = cur_kernel14[j]; + *(cur_kernel_interleaved++) = cur_kernel15[j]; + } + } + + for(i = (kernel_chan & -16); i < (kernel_chan & -4); i += 4) + { + cur_kernel0 = kernel + kernel_size * i; + cur_kernel1 = kernel + kernel_size * (i + 1); + cur_kernel2 = kernel + kernel_size * (i + 2); + cur_kernel3 = kernel + kernel_size * (i + 3); + for(j = 0; j < kernel_size; j++) + { + *(cur_kernel_interleaved++) = cur_kernel0[j]; + *(cur_kernel_interleaved++) = cur_kernel1[j]; + *(cur_kernel_interleaved++) = cur_kernel2[j]; + *(cur_kernel_interleaved++) = cur_kernel3[j]; + } + } + // last 4 kernel + cur_kernel0 = kernel + kernel_size * i; + cur_kernel1 = kernel + kernel_size * (i + 1); + cur_kernel2 = kernel + kernel_size * (i + 2); + if((kernel_chan & 0x3) == 3) + { + for(j = 0; j < kernel_size; j++) + { + *(cur_kernel_interleaved++) = cur_kernel0[j]; + *(cur_kernel_interleaved++) = cur_kernel1[j]; + *(cur_kernel_interleaved++) = cur_kernel2[j]; + *(cur_kernel_interleaved++) = 0.0; + } + } + else if((kernel_chan & 0x3) == 2) + { + for(j = 0; j < kernel_size; j++) + { + *(cur_kernel_interleaved++) = cur_kernel0[j]; + *(cur_kernel_interleaved++) = cur_kernel1[j]; + *(cur_kernel_interleaved++) = 0.0; + *(cur_kernel_interleaved++) = 0.0; + } + } + else if((kernel_chan & 0x3) == 1) + { + for(j = 0; j < kernel_size; j++) + { + *(cur_kernel_interleaved++) = cur_kernel0[j]; + *(cur_kernel_interleaved++) = 0.0; + *(cur_kernel_interleaved++) = 0.0; + *(cur_kernel_interleaved++) = 0.0; + } + } + + return; +} + +static void sgemm4x16(float* col, float* kernel, float* biases, bool bias_term, float* output, int kernel_size, + int col_start, int col_end, int kernel_start, int kernel_end, int output_xy, int activation, + int cpu_type) +{ + float initial[64], 
result[64]; + int col_line, kernel_num; + int i, j; + float *cur_col, *cur_kernel; + + for(kernel_num = (kernel_start & -16); kernel_num < (kernel_end & -16); kernel_num += 16) + { + if(bias_term) + for(i = 0; i < 64; i++) + initial[i] = *(biases + kernel_num + (i >> 2)); + cur_kernel = ( float* )(kernel + kernel_num * kernel_size); + + for(col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) + { + cur_col = ( float* )(col + col_line * kernel_size); + if(activation >= 0) + sgemm_4x16_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + else + sgemm_4x16_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + + if(activation > 0) + { + for(i = 0; i < 16; i++) + { + *(output + (kernel_num + i) * output_xy + col_line) = + std::min(result[(i << 2)], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 1) = + std::min(result[(i << 2) + 1], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 2) = + std::min(result[(i << 2) + 2], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 3) = + std::min(result[(i << 2) + 3], ( float )activation); + } + } + else + { + for(i = 0; i < 16; i++) + { + *(output + (kernel_num + i) * output_xy + col_line) = result[(i << 2)]; + *(output + (kernel_num + i) * output_xy + col_line + 1) = result[(i << 2) + 1]; + *(output + (kernel_num + i) * output_xy + col_line + 2) = result[(i << 2) + 2]; + *(output + (kernel_num + i) * output_xy + col_line + 3) = result[(i << 2) + 3]; + } + } + } + if(col_end & 0x3) + { + cur_col = ( float* )(col + col_line * kernel_size); + + if(activation >= 0) + sgemm_4x16_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + else + sgemm_4x16_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + + for(i = 0; i < 16; i++) + for(j = 0; j < (col_end & 0x3); j++) + { + if(activation > 0) + *(output + (kernel_num + i) * 
output_xy + col_line + j) = + std::min(result[(i << 2) + j], ( float )activation); + else + *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; + } + } + } +} + +static void sgemm4x4(float* col, float* kernel, float* biases, bool bias_term, float* output, int kernel_size, + int col_start, int col_end, int kernel_start, int kernel_end, int output_xy, int activation, + int cpu_type) +{ + float initial[16], result[16]; + int col_line, kernel_num; + int i, j; + float *cur_col, *cur_kernel; + + for(kernel_num = kernel_start & -4; kernel_num < (kernel_end & -4); kernel_num += 4) + { + if(bias_term) + for(i = 0; i < 16; i++) + initial[i] = *(biases + kernel_num + (i >> 2)); + cur_kernel = ( float* )(kernel + kernel_num * kernel_size); + for(col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) + { + cur_col = ( float* )(col + col_line * kernel_size); + + if(activation >= 0) + sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + else + sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + + if(activation > 0) + { + for(i = 0; i < 4; i++) + { + *(output + (kernel_num + i) * output_xy + col_line) = + std::min(result[(i << 2) + 0], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 1) = + std::min(result[(i << 2) + 1], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 2) = + std::min(result[(i << 2) + 2], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 3) = + std::min(result[(i << 2) + 3], ( float )activation); + } + } + else + { + for(i = 0; i < 4; i++) + { + *(output + (kernel_num + i) * output_xy + col_line) = result[(i << 2) + 0]; + *(output + (kernel_num + i) * output_xy + col_line + 1) = result[(i << 2) + 1]; + *(output + (kernel_num + i) * output_xy + col_line + 2) = result[(i << 2) + 2]; + *(output + (kernel_num + i) * output_xy + col_line + 3) = result[(i << 2) + 3]; 
+ } + } + } + if(col_end & 0x3) + { + cur_col = ( float* )(col + col_line * kernel_size); + if(activation >= 0) + sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + else + sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + for(i = 0; i < 4; i++) + { + for(j = 0; j < (col_end & 0x3); j++) + { + if(activation > 0) + *(output + (kernel_num + i) * output_xy + col_line + j) = + std::min(result[(i << 2) + j], ( float )activation); + else + *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; + } + } + } + } + + if(kernel_end & 0x3) + { + if(bias_term) + for(i = 0; i < ((kernel_end & 0x3) << 2); i++) + initial[i] = *(biases + kernel_num + (i >> 2)); + cur_kernel = ( float* )(kernel + kernel_num * kernel_size); + for(col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4) + { + cur_col = ( float* )(col + col_line * kernel_size); + + if(activation >= 0) + sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + else + sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + + if(activation > 0) + { + for(i = 0; i < (kernel_end & 0x3); i++) + { + *(output + (kernel_num + i) * output_xy + col_line) = + std::min(result[(i << 2) + 0], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 1) = + std::min(result[(i << 2) + 1], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 2) = + std::min(result[(i << 2) + 2], ( float )activation); + *(output + (kernel_num + i) * output_xy + col_line + 3) = + std::min(result[(i << 2) + 3], ( float )activation); + } + } + else + { + for(i = 0; i < (kernel_end & 0x3); i++) + { + *(output + (kernel_num + i) * output_xy + col_line) = result[(i << 2) + 0]; + *(output + (kernel_num + i) * output_xy + col_line + 1) = result[(i << 2) + 1]; + *(output + (kernel_num + i) * output_xy + col_line + 2) = result[(i << 2) 
+ 2]; + *(output + (kernel_num + i) * output_xy + col_line + 3) = result[(i << 2) + 3]; + } + } + } + if(col_end & 0x3) + { + cur_col = ( float* )(col + col_line * kernel_size); + if(activation >= 0) + sgemm_4x4_interleave_relu_fused(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + else + sgemm_4x4_interleave(bias_term, initial, cur_col, cur_kernel, result, kernel_size); + + for(i = 0; i < (kernel_end & 0x3); i++) + { + for(j = 0; j < (col_end & 0x3); j++) + { + if(activation > 0) + *(output + (kernel_num + i) * output_xy + col_line + j) = + std::min(result[(i << 2) + j], ( float )activation); + else + *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; + } + } + } + } +} + +struct im2col_param +{ + float* im; + float* col; + int input_chan; + int input_x; + int input_y; + int kernel_x; + int kernel_y; + int stride_x; + int stride_y; + int dilation_x; + int dilation_y; + int pad_x0; + int pad_x1; + int pad_y0; + int pad_y1; + int output_x; + int output_y; + int col_start; + int col_end; +}; + +struct sgemm_param +{ + float* col; + float* kernel; + float* biases; + bool bias_term; + float* output; + int kernel_size; + int col_start; + int col_end; + int kernel_start; + int kernel_end; + int output_xy; +}; + +struct conv1x1s1_param +{ + const float* input; + float* output; + const float* kernel; + const float* bias; + int in_h; + int in_w; + int in_ch; + int out_h; + int out_w; + int out_ch; + bool relu_fused; +}; + +struct ConvFast : public MTNodeOps +{ + bool Prerun(Node* node) override; + bool Reshape(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + bool GetSharedMemorySize(Node*, unsigned int& mem_size) override; + bool SetSharedMemoryAddr(Node*, void* mem_addr, int mem_size) override; + + bool float_mode; + bool im2col_aider(int cpu, int seq, void* data /* im2col_param * param */); + bool sgemm_aider(int cpu, int seq, void* data /* sgemm_param * param */); + bool 
sgemm4x4_aider(int cpu, int seq, void* data /* sgemm_param * param */); + + int activation; + bool dynamic_shape; +}; + +bool ConvFast::im2col_aider(int cpu, int seq, void* data) +{ + im2col_param* param = ( im2col_param* )(data); + im2col(param->im, param->col, param->input_chan, param->input_x, param->input_y, param->kernel_x, param->kernel_y, + param->stride_x, param->stride_y, param->dilation_x, param->dilation_y, param->pad_x0, param->pad_x1, + param->pad_y0, param->pad_y1, param->output_x, param->output_y, param->col_start, param->col_end); + + return true; +} + +bool ConvFast::sgemm4x4_aider(int cpu, int seq, void* data) +{ + int cpu_type = TYPE_A72; + sgemm_param* param = ( sgemm_param* )(data); + + sgemm4x4(param->col, param->kernel, param->biases, param->bias_term, param->output, param->kernel_size, + param->col_start, param->col_end, param->kernel_start, param->kernel_end, param->output_xy, activation, + cpu_type); + + return true; +} + +bool ConvFast::sgemm_aider(int cpu, int seq, void* data) +{ + int cpu_type = TYPE_A72; + sgemm_param* param = ( sgemm_param* )(data); + + sgemm4x16(param->col, param->kernel, param->biases, param->bias_term, param->output, param->kernel_size, + param->col_start, param->col_end, param->kernel_start, param->kernel_end, param->output_xy, activation, + cpu_type); + + return true; +} + +bool ConvFast::Prerun(Node* node) +{ + Convolution* conv_op = dynamic_cast(node->GetOp()); + ConvParam* param = conv_op->GetParam(); + int group = param->group; + + Tensor* output_tensor = node->GetOutputTensor(0); + TShape& output_shape = output_tensor->GetShape(); + int output_chan = output_shape.GetC() / group; + + /* pre-allocate col_buf */ + Tensor* input_tensor = node->GetInputTensor(0); + TShape& input_shape = input_tensor->GetShape(); + + int input_chan = input_shape.GetC() / group; + int kernel_size = input_chan * param->kernel_h * param->kernel_w; + + if(!dynamic_shape) + { + if(node->ExistAttr("shared_col_buf")) + { + float* addr = 
( float* )any_cast(node->GetAttr("shared_col_buf")); + + (*node)["col_buf"] = addr; + } + else + { + unsigned int col_size; + + GetSharedMemorySize(node, col_size); + + float* col_buf = ( float* )mem_alloc(col_size); + (*node)["col_buf"] = col_buf; + node->SetAttr("col_buf_allocated", col_size); + } + } + + /* packing kernel data */ + Tensor* kernel_tensor = node->GetInputTensor(1); + + float* kernel_interleaved = NULL; + + int kernel_interleaved_size_g = kernel_size * ((output_chan + 3) & -4); + int kernel_size_g = kernel_size * output_chan; + float* kernel_org = ( float* )get_tensor_mem(kernel_tensor); + kernel_interleaved = ( float* )mem_alloc(sizeof(float) * (kernel_interleaved_size_g * group) + 128); + + for(int g = 0; g < group; ++g) + { + float* kernel = kernel_org + g * kernel_size_g; + float* kernel_interleaved_g = kernel_interleaved + g * kernel_interleaved_size_g; + interleave_kernel(kernel, kernel_interleaved_g, output_chan, kernel_size); + } + + (*node)["kernel_interleaved"] = kernel_interleaved; + + if(exec_attr->low_mem_mode) + { + kernel_tensor->FreeMem(); + } + + return true; +} + +bool ConvFast::Reshape(Node* node) +{ + unsigned int new_col_size; + + GetSharedMemorySize(node, new_col_size); + + if(node->ExistAttr("col_buf_allocated")) + { + unsigned int col_size = any_cast(node->GetAttr("col_buf_allocated")); + if(new_col_size == col_size) + return true; + + float* addr = any_cast(node->GetAttr("col_buf")); + mem_free(addr); + } + + float* col_buf = ( float* )mem_alloc(new_col_size); + (*node)["col_buf"] = col_buf; + + node->SetAttr("col_buf_allocated", new_col_size); + return true; +} + +bool ConvFast::Run(Node* node) +{ + /* input */ + Tensor* input_tensor = node->GetInputTensor(0); + + Convolution* conv_op = dynamic_cast(node->GetOp()); + ConvParam* param = conv_op->GetParam(); + + const TShape& input_shape = input_tensor->GetShape(); + + int group = param->group; + int input_chan = input_shape.GetC() / group; + int input_h = 
input_shape.GetH(); + int input_w = input_shape.GetW(); + int input_size = input_w * input_h * input_chan; + int pad_x0 = param->pad_w0; // left padding columns + int pad_x1 = param->pad_w1; // right padding columns + int pad_y0 = param->pad_h0; // top padding rows + int pad_y1 = param->pad_h1; // bottom padding rows + int stride_x = param->stride_w; + int stride_y = param->stride_h; + int dilation_x = param->dilation_w; + int dilation_y = param->dilation_h; + float* input_org = ( float* )get_tensor_mem(input_tensor); + float* col = any_cast(node->GetAttr("col_buf")); + + /* output */ + Tensor* output_tensor = node->GetOutputTensor(0); + TShape& output_shape = output_tensor->GetShape(); + float* output_org = ( float* )get_tensor_mem(output_tensor); + int output_y = output_shape.GetH(); + int output_x = output_shape.GetW(); + int output_xy = output_x * output_y; + int output_chan = output_shape.GetC() / group; + int output_n = output_shape.GetN(); + + /* kernel */ + int kernel_x = param->kernel_w; + int kernel_y = param->kernel_h; + int kernel_size = input_chan * kernel_x * kernel_y; + + float* kernel_interleaved = any_cast(node->GetAttr("kernel_interleaved")); + + int cpu_number = cpu_info->GetCPUNumber(); + + /* biases */ + + float* biases = NULL; + bool have_biases = (node->GetInputNum() > 2); + + if(have_biases) + { + biases = ( float* )get_tensor_mem(node->GetInputTensor(2)); + } + + int cpu_type; + + if(cpu_info->GetCPUModel(cpu_info->GetMasterCPU()) == CPU_A72) + cpu_type = TYPE_A72; + else + cpu_type = TYPE_A53; + + /* block size split parameter */ + int L2_CACHE_SIZE = (cpu_type == TYPE_A53) ? 512 * 1024 : 1024 * 1024; + int kernel_size_l1 = kernel_size; + int col_cnt_l2 = L2_CACHE_SIZE / 4 / kernel_size_l1 * 7 / 8; + col_cnt_l2 = col_cnt_l2 > 4 ? 
(col_cnt_l2 & -4) : 4; + + /* one image per time */ + for(int i = 0; i < output_n; i++) + { + float* input = input_org + i * input_size * group; + float* output = output_org + i * output_xy * output_chan * group; + + for(int g = 0; g < group; g++) + { + float* input_g = input + g * input_size; + int total_num = output_xy * input_chan * kernel_x * kernel_y; + + if(cpu_number == 1 || total_num < 100 * 1000) + im2col(input_g, col, input_chan, input_w, input_h, kernel_x, kernel_y, stride_x, stride_y, dilation_x, + dilation_y, pad_x0, pad_x1, pad_y0, pad_y1, output_x, output_y, 0, output_xy); + else + { + std::vector task_list; + std::vector param_list; + + auto f = std::bind(&ConvFast::im2col_aider, this, std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + + int steps = output_xy / cpu_number; + + steps = (steps + 3) & (~0x3); + + int offset; + int real_cpu_number = cpu_number; + + while(1) + { + offset = steps * real_cpu_number - output_xy; + + if(offset < steps) + break; + + real_cpu_number--; + } + + task_list.resize(real_cpu_number); + param_list.resize(real_cpu_number); + + for(int i = 0; i < real_cpu_number; i++) + { + im2col_param* param = ¶m_list[i]; + sub_op_task* task = &task_list[i]; + + task->exec_func = f; + task->seq = i; + task->data = param; + + param->im = input_g; + param->col = col; + param->input_chan = input_chan; + param->input_x = input_w; + param->input_y = input_h; + param->kernel_x = kernel_x; + param->kernel_y = kernel_y; + param->stride_x = stride_x; + param->stride_y = stride_y; + param->dilation_x = dilation_x; + param->dilation_y = dilation_y; + param->pad_x0 = pad_x0; + param->pad_x1 = pad_x1; + param->pad_y0 = pad_y0; + param->pad_y1 = pad_y1; + param->output_x = output_x; + param->output_y = output_y; + param->col_start = i * steps; + param->col_end = param->col_start + steps; + } + + param_list[real_cpu_number - 1].col_end = output_xy; + + task_dispatch(task_list, -1); + wait_done(); + } + + float* kernel_g = 
kernel_interleaved + g * (kernel_size * ((output_chan + 3) & -4)); + float* output_g = output + g * output_xy * output_chan; + float* bias_g = biases + g * output_chan; + + std::vector task_list; + std::vector param_list; + + int chan_16_num = output_chan / 16; + int chan_4_num = (output_chan & 0xf) ? 1 : 0; + int l2_loop = (output_xy - 1) / col_cnt_l2 + 1; + int max_task_num = l2_loop * (chan_16_num + chan_4_num); + + if(cpu_number > 1) + param_list.resize(max_task_num); + + // for input block of L2 cache size + for(int col_i = 0; col_i < output_xy; col_i += col_cnt_l2) + { + int col_start = col_i; + int col_end = col_i + col_cnt_l2; + col_end = col_end > output_xy ? output_xy : col_end; + + if(cpu_number == 1) + { + sgemm4x16(col, kernel_g, bias_g, have_biases, output_g, kernel_size, col_start, col_end, 0, + output_chan & -16, output_xy, activation, cpu_type); + if(output_chan & 0xf) + sgemm4x4(col, kernel_g, bias_g, have_biases, output_g, kernel_size, col_start, col_end, + output_chan & -16, output_chan, output_xy, activation, cpu_type); + } + else + { + auto f = std::bind(&ConvFast::sgemm_aider, this, std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + + for(int i = 0; i < chan_16_num; i++) + { + sub_op_task tmp_task; + sgemm_param* param = ¶m_list[task_list.size()]; + sub_op_task* task = &tmp_task; + task->exec_func = f; + task->seq = i; + task->data = param; + + param->col = col; + param->kernel = kernel_g; + param->biases = bias_g; + param->bias_term = have_biases; + param->output = output_g; + param->kernel_size = kernel_size; + param->col_start = col_start; + param->col_end = col_end; + param->kernel_start = i * 16; + param->kernel_end = param->kernel_start + 16; + param->output_xy = output_xy; + + task_list.emplace_back(tmp_task); + } + + if(output_chan & 0xf) + { + auto f = std::bind(&ConvFast::sgemm4x4_aider, this, std::placeholders::_1, + std::placeholders::_2, std::placeholders::_3); + sub_op_task tmp_task; + sgemm_param* param = 
¶m_list[task_list.size()]; + sub_op_task* task = &tmp_task; + task->exec_func = f; + task->seq = task_list.size() - 1; + task->data = param; + + param->col = col; + param->kernel = kernel_g; + param->biases = bias_g; + param->bias_term = have_biases; + param->output = output_g; + param->kernel_size = kernel_size; + param->col_start = col_start; + param->col_end = col_end; + param->kernel_start = output_chan & -16; + param->kernel_end = output_chan; + param->output_xy = output_xy; + + task_list.emplace_back(tmp_task); + } + } + } + + if(cpu_number > 1) + { + task_dispatch(task_list, -1); + wait_done(); + } + } + } + + return true; +} + +bool ConvFast::Postrun(Node* node) +{ + if(node->ExistAttr("kernel_interleaved")) + { + float* addr; + addr = any_cast(node->GetAttr("kernel_interleaved")); + + mem_free(addr); + + node->RemoveAttr("kernel_interleaved"); + } + + if(node->ExistAttr("col_buf_allocated")) + { + float* addr = any_cast(node->GetAttr("col_buf")); + mem_free(addr); + + node->RemoveAttr("col_buf_allocated"); + } + + if(node->ExistAttr("col_buf")) + node->RemoveAttr("col_buf"); + + return true; +} + +bool ConvFast::GetSharedMemorySize(Node* node, unsigned int& mem_size) +{ + Convolution* conv_op = dynamic_cast(node->GetOp()); + ConvParam* param = conv_op->GetParam(); + int group = param->group; + + Tensor* output_tensor = node->GetOutputTensor(0); + TShape& output_shape = output_tensor->GetShape(); + int output_y = output_shape.GetH(); + int output_x = output_shape.GetW(); + + Tensor* input_tensor = node->GetInputTensor(0); + TShape& input_shape = input_tensor->GetShape(); + + int input_chan = input_shape.GetC() / group; + int kernel_size = input_chan * param->kernel_h * param->kernel_w; + int output_xy = output_x * output_y; + + mem_size = (sizeof(float) * (kernel_size * ((output_xy + 3) & -4)) + 128); + + return true; +} + +bool ConvFast::SetSharedMemoryAddr(Node* node, void* mem_addr, int mem_size) +{ + (*node)["shared_col_buf"] = mem_addr; + return true; 
+} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + + if(exec_attr->graph_layout == TENGINE_LAYOUT_NHWC) + return nullptr; + + ConvFast* ops = new ConvFast(); + + ops->need_free = true; + + if(node->IsDynamicShape()) + ops->dynamic_shape = true; + else + ops->dynamic_shape = false; + + Convolution* conv_op = dynamic_cast(node->GetOp()); + ConvParam* param = conv_op->GetParam(); + + ops->activation = param->activation; + + return ops; +} + +} // conv_fast + +void RegisterConv2dFast(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor("arm64", "Convolution", conv_fast::SelectFunc, + conv_fast::default_prio); +} + +} // namespace TEngine diff --git a/executor/operator/arm64/conv/dw_k3s1p1.S b/executor/operator/arm64/conv/dw_k3s1p1.S index 36a0d3563..ca3e34223 100644 --- a/executor/operator/arm64/conv/dw_k3s1p1.S +++ b/executor/operator/arm64/conv/dw_k3s1p1.S @@ -1,736 +1,736 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haitao@openailab.com - */ - - -//x0: input -//x1: h -//x2: w -//x3: kernel -//x4: output //L-2 -//x5 : bias -//x10: L-1 output -//x6: L0 output -//x7: processed item -//x8: counter -//x9: x2*4 - -//v0-v3: L-2 -//v4-v7: L-1 -//v8-v11: L0 -//v12-v15/v16-v20: input two group -//v24-v26: kernel -//v27 --- saved previous vector -// v28,v29 --- shifted - -//v30 : bias -#ifndef KERNEL_NAME -#define KERNEL_NAME dw_k3s1p1 -#endif - -.text -.align 5 -.global KERNEL_NAME -.type KERNEL_NAME, %function - - -KERNEL_NAME: - - //Load Kernel - ld1 {v24.4s,v25.4s,v26.4s}, [x3] - - ext v26.16b,v25.16b,v26.16b,8 - ext v25.16b,v24.16b,v25.16b,12 - - lsl x9,x2,2 - fmov s31,wzr - dup v31.4s,v31.s[0] - - cbz x5 ,non_biases - //get the bias - ldr s30, [x5] - dup v30.4s,v30.s[0] - - b first_row_start -non_biases: - fmov s30, wzr - dup v30.4s,v30.s[0] -first_row_start: - sub x1,x1,1 - sub x7,x2,1 //save last item in row - lsr x8,x7,2 - lsl x7,x8,2 - - ins v27.s[3],v31.s[0] //pre_vector for input - - cbz x1,single_line - mov x10,x4 //L-1 - add x6,x10,x9 //L-0 - - - cbz x8,first_last_4 - - //output - - -first_row_loop: - //load 4 float input - ld1 {v12.4s},[x0],#16 - ld1r {v13.4s},[x0] - - ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - ins v27.s[3],v12.s[3] //save prev vector - - //L-1: k1 xinput - fmul v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v4.4s,v29.4s,v25.s[2] //k12 - - st1 {v4.4s},[x10],#16 - - //L0 - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla v8.4s,v12.4s,v24.s[1] //k01 - fmla v8.4s,v29.4s,v24.s[2] //k02 - - st1 {v8.4s},[x6],#16 - - //next loop - subs x8,x8,1 - b.ne first_row_loop - -first_last_4: - //left ones: 1-4 - sub x8,x2,x7 - cmp x8,4 - blt first_less_4 - - //4 nodes - ld1 {v12.4s},[x0],#16 - ins v13.s[0],v31.s[0] - - ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - 
//L-1: k1 xinput - fmul v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v4.4s,v29.4s,v25.s[2] //k12 - - st1 {v4.4s},[x10],#16 - - //L0 - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla v8.4s,v12.4s,v24.s[1] //k01 - fmla v8.4s,v29.4s,v24.s[2] //k02 - - st1 {v8.4s},[x6],#16 - - b first_row_done - -first_less_4: - cmp x8,1 - bge first_1_2_3 - b first_row_done - -first_1_2_3: - dup v12.4s,v31.s[0] - dup v13.4s,v31.s[0] - - //2 or 3 items - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - sub x7,x8,1 - cbz x7, first_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - sub x7,x8,2 - - cbz x7, first_left_load_done - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - -first_left_load_done: - - ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - //L-1 - fmul v4.4s,v28.4s,v25.s[0] //k10, - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v8.4s,v12.4s,v24.s[1] //k01 - fmla v4.4s,v29.4s,v25.s[2] //k12 - fmla v8.4s,v29.4s,v24.s[2] //k02 - - //save result: 2 or 3 - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - ins v28.s[0],v8.s[0] - str s28,[x6],#4 - - cmp x8, 2 - blt first_row_done - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - - - ins v28.s[0],v8.s[1] - str s28,[x6],#4 - - cmp x8,3 - blt first_row_done - - ins v28.s[0],v4.s[2] - str s28,[x10] - - ins v28.s[0],v8.s[2] - str s28,[x6] - -first_row_done: - -mid_row_start: - - sub x1,x1,1 - cbz x1, last_row_start - - sub x7,x2,1 //save one - lsr x8,x7,2 - lsl x7,x8,2 - - add x10,x4,x9 //L-1 - add x6,x10,x9 //L0 - dup v27.4s,v31.s[0] - - cbz x8,mid_last_4 - -mid_loop_start: - - ld1 {v0.4s},[x4] - ld1 {v4.4s},[x10] - //ld1 {v8.4s},[x6],#16 //L0 is always zero - - ld1 {v12.4s},[x0],#16 - ld1r {v13.4s},[x0] - - ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 - //v12: a00, a01, a02 ,a03 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmul 
v8.4s,v28.4s,v24.s[0] //k00 - fmla v0.4s,v12.4s,v26.s[1] //k21, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v8.4s,v12.4s,v24.s[1] //k01 - fmla v0.4s,v29.4s,v26.s[2] //k22 - fmla v4.4s,v29.4s,v25.s[2] //k12 - fmla v8.4s,v29.4s,v24.s[2] //k02 -//add bias - fadd v0.4s,v0.4s,v30.4s -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - st1 {v0.4s},[x4],#16 - - //L-1 - st1 {v4.4s},[x10],#16 - - - //L0 - st1 {v8.4s},[x6],#16 - - ins v27.s[3],v12.s[3] - - //next loop - subs x8,x8,1 - b.ne mid_loop_start - -mid_last_4: - sub x8,x2,x7 - cmp x8,4 - blt mid_less_4 - - ld1 {v0.4s},[x4] - ld1 {v4.4s},[x10] - - ld1 {v12.4s},[x0],#16 - ins v13.s[0],v31.s[0] - - ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 - //v12: a00, a01, a02 ,a03 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla v0.4s,v12.4s,v26.s[1] //k21, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v8.4s,v12.4s,v24.s[1] //k01 - fmla v0.4s,v29.4s,v26.s[2] //k22 - fmla v4.4s,v29.4s,v25.s[2] //k12 - fmla v8.4s,v29.4s,v24.s[2] //k02 -//add bias - fadd v0.4s,v0.4s,v30.4s -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - st1 {v0.4s},[x4],#16 - - - //L-1 - st1 {v4.4s},[x10],#16 - - //L0 - st1 {v8.4s},[x6],#16 - - b mid_row_start - -mid_less_4: - cmp x8,1 - blt mid_row_start - -mid_left_1_2_3: - - dup v12.4s,v31.s[0] - dup v13.4s,v31.s[0] - dup v0.4s,v31.s[0] - dup v4.4s,v31.s[0] - - - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - - ldr s28,[x4] - ins v0.s[0],v28.s[0] - ldr s28,[x10] - ins v4.s[0],v28.s[0] - - - cmp x8,2 - blt mid_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - - ldr s28,[x4,#4] - ins v0.s[1],v28.s[0] - ldr s28,[x10, #4] - ins v4.s[1],v28.s[0] - - cmp x8,3 - blt mid_left_load_done - - - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - - ldr s28,[x4,#8] - ins v0.s[2],v28.s[0] - ldr s28,[x10, #8] - ins v4.s[2],v28.s[0] - -mid_left_load_done: - - ext 
v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla v0.4s,v12.4s,v26.s[1] //k21, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v8.4s,v12.4s,v24.s[1] //k01 - - fmla v0.4s,v29.4s,v26.s[2] //k22 - fmla v4.4s,v29.4s,v25.s[2] //k12 - fmla v8.4s,v29.4s,v24.s[2] //k02 - -//add bias - fadd v0.4s,v0.4s,v30.4s - //save result:1, 2 or 3 - ins v28.s[0],v0.s[0] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - ins v28.s[0],v8.s[0] - str s28,[x6],#4 - - cmp x8,2 - blt mid_row_start - - ins v28.s[0],v0.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - - ins v28.s[0],v8.s[1] - str s28,[x6],#4 - - cmp x8,3 - blt mid_row_start - - ins v28.s[0],v0.s[2] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v4.s[2] - str s28,[x10] - - ins v28.s[0],v8.s[2] - str s28,[x6] - - b mid_row_start - - -last_row_start: - - - sub x7,x2,1 - lsr x8,x7,2 - lsl x7,x8,2 - - dup v27.4s,v31.s[0] - - add x10,x4,x9 //L-1 - - cbz x8,last_last_4 - -last_loop_start: - - ld1 {v0.4s},[x4] - ld1 {v4.4s},[x10] - - ld1 {v12.4s},[x0],#16 - ld1 {v13.4s},[x0] - - ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 - //v12: a00, a01, a02 ,a03 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v0.4s,v12.4s,v26.s[1] //k21, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v0.4s,v29.4s,v26.s[2] //k22 - fmla v4.4s,v29.4s,v25.s[2] //k12 -//add bias - fadd v0.4s,v0.4s,v30.4s - -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - st1 {v0.4s},[x4],#16 - - //L-1 -//add bias - fadd v4.4s,v4.4s,v30.4s -#ifdef CONV_RELU_FUSE - fmax v4.4s,v4.4s,v31.4s -#endif - st1 {v4.4s},[x10],#16 - - ins 
v27.s[3],v12.s[3] - - //next loop - subs x8,x8,1 - b.ne last_loop_start - -last_last_4: - - sub x8,x2,x7 - cmp x8,4 - blt last_less_4 - - ld1 {v12.4s},[x0],#16 - dup v13.4s,v31.s[0] - - ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 - //v12: a00, a01, a02 ,a03 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - ld1 {v0.4s},[x4] - ld1 {v4.4s},[x10] - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v0.4s,v12.4s,v26.s[1] //k21, - fmla v0.4s,v29.4s,v26.s[2] //k22 -//add bias - fadd v0.4s,v0.4s,v30.4s - -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - st1 {v0.4s},[x4],#16 - - - //L-1 - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v4.4s,v29.4s,v25.s[2] //k12 -//add bias - fadd v4.4s,v4.4s,v30.4s -#ifdef CONV_RELU_FUSE - fmax v4.4s,v4.4s,v31.4s -#endif - st1 {v4.4s},[x10],#16 - - ins v27.s[3],v12.s[3] - - b last_row_done - -last_less_4: - - cmp x8,1 - blt last_row_done - -last_1_2_3: - - dup v12.4s,v31.s[0] - dup v13.4s,v31.s[0] - dup v0.4s,v31.s[0] - dup v4.4s,v31.s[0] - - - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - ldr s28,[x4] - ins v0.s[0],v28.s[0] - ldr s28,[x10] - ins v4.s[0],v28.s[0] - - sub x7,x8,1 - cbz x7, last_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - - ldr s28,[x4,#4] - ins v0.s[1],v28.s[0] - ldr s28,[x10,#4] - ins v4.s[1],v28.s[0] - - - sub x7,x8,2 - cbz x7, last_left_load_done - - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - - ldr s28,[x4,#8] - ins v0.s[2],v28.s[0] - ldr s28,[x10,#8] - ins v4.s[2],v28.s[0] - -last_left_load_done: - - ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v0.4s,v12.4s,v26.s[1] //k21, - fmla v0.4s,v29.4s,v26.s[2] //k22 - - //L-1 - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v4.4s,v29.4s,v25.s[2] //k12 - -//add bias - fadd v0.4s,v0.4s,v30.4s - //save result: 1 2 or 3 - ins v28.s[0],v0.s[0] -#ifdef CONV_RELU_FUSE - 
fmax s28,s28,s31 -#endif - str s28,[x4],#4 - -//add bias - fadd v4.4s,v4.4s,v30.4s - - ins v28.s[0],v4.s[0] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x10],#4 - - cmp x8,2 - blt last_row_done - - ins v28.s[0],v0.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v4.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x10],#4 - - cmp x8,3 - blt last_row_done - - ins v28.s[0],v0.s[2] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4] - - ins v28.s[0],v4.s[2] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x10] - - -last_row_done: - ret - -single_line: - mov x10,x4 - cbz x8,single_line_last_4 - -single_line_row_loop: - //load 4 input - ld1 {v12.4s},[x0],#16 - ld1r {v13.4s},[x0] - - ext v28.16b,v27.16b,v12.16b,12 //last_3, a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - ins v27.s[3],v12.s[3] - - dup v4.4s,v30.s[0] - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v4.4s,v29.4s,v25.s[2] //k12 -#ifdef CONV_RELU_FUSE - fmax v4.4s,v4.4s,v31.4s -#endif - - st1 {v4.4s},[x10],#16 - - //next loop - subs x8,x8,1 - b.ne single_line_row_loop - -single_line_last_4: - //x8=x2-x7, and x7<=x2-1 and x7=4N and N is non-negative number, so left ones: 1-4 - sub x8,x2,x7 - cmp x8,4 - blt single_line_less_4 - - ld1 {v12.4s},[x0],#16 - ins v13.s[0],v31.s[0] - - ext v28.16b,v27.16b,v12.16b,12 //last_3, a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - - dup v4.4s,v30.s[0] - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v12.4s,v25.s[1] //k11, - fmla v4.4s,v29.4s,v25.s[2] //k12 -#ifdef CONV_RELU_FUSE - fmax v4.4s,v4.4s,v31.4s -#endif - - st1 {v4.4s},[x10],#16 - b single_line_done - -single_line_less_4: - cmp x8,1 - bge single_line_1_2_3 - b single_line_done - -single_line_1_2_3: - dup v12.4s,v31.s[0] - dup v13.4s,v31.s[0] - - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - sub x7,x8,1 - cbz x7,single_line_left_load_done - - ldr 
s28,[x0],#4 - ins v12.s[1],v28.s[0] - sub x7,x8,2 - - cbz x7,single_line_left_load_done - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - -single_line_left_load_done: - ext v28.16b,v27.16b,v12.16b,12 //last_3, a00, a01, a02 - ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 - - dup v4.4s,v30.s[0] - fmla v4.4s,v28.4s,v25.s[0] //k10 - fmla v4.4s,v12.4s,v25.s[1] //k11 - fmla v4.4s,v29.4s,v25.s[2] //k12 -#ifdef CONV_RELU_FUSE - fmax v4.4s,v4.4s,v31.4s -#endif - - //save result - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - cmp x8,2 - blt single_line_done - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - - cmp x8,3 - blt single_line_done - - ins v28.s[0],v4.s[2] - str s28,[x10] - -single_line_done: - ret +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ + + +//x0: input +//x1: h +//x2: w +//x3: kernel +//x4: output //L-2 +//x5 : bias +//x10: L-1 output +//x6: L0 output +//x7: processed item +//x8: counter +//x9: x2*4 + +//v0-v3: L-2 +//v4-v7: L-1 +//v8-v11: L0 +//v12-v15/v16-v20: input two group +//v24-v26: kernel +//v27 --- saved previous vector +// v28,v29 --- shifted + +//v30 : bias +#ifndef KERNEL_NAME +#define KERNEL_NAME dw_k3s1p1 +#endif + +.text +.align 5 +.global KERNEL_NAME +.type KERNEL_NAME, %function + + +KERNEL_NAME: + + //Load Kernel + ld1 {v24.4s,v25.4s,v26.4s}, [x3] + + ext v26.16b,v25.16b,v26.16b,8 + ext v25.16b,v24.16b,v25.16b,12 + + lsl x9,x2,2 + fmov s31,wzr + dup v31.4s,v31.s[0] + + cbz x5 ,non_biases + //get the bias + ldr s30, [x5] + dup v30.4s,v30.s[0] + + b first_row_start +non_biases: + fmov s30, wzr + dup v30.4s,v30.s[0] +first_row_start: + sub x1,x1,1 + sub x7,x2,1 //save last item in row + lsr x8,x7,2 + lsl x7,x8,2 + + ins v27.s[3],v31.s[0] //pre_vector for input + + cbz x1,single_line + mov x10,x4 //L-1 + add x6,x10,x9 //L-0 + + + cbz x8,first_last_4 + + //output + + +first_row_loop: + //load 4 float input + ld1 {v12.4s},[x0],#16 + ld1r {v13.4s},[x0] + + ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + ins v27.s[3],v12.s[3] //save prev vector + + //L-1: k1 xinput + fmul v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v4.4s,v29.4s,v25.s[2] //k12 + + st1 {v4.4s},[x10],#16 + + //L0 + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v8.4s,v12.4s,v24.s[1] //k01 + fmla v8.4s,v29.4s,v24.s[2] //k02 + + st1 {v8.4s},[x6],#16 + + //next loop + subs x8,x8,1 + b.ne first_row_loop + +first_last_4: + //left ones: 1-4 + sub x8,x2,x7 + cmp x8,4 + blt first_less_4 + + //4 nodes + ld1 {v12.4s},[x0],#16 + ins v13.s[0],v31.s[0] + + ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + 
//L-1: k1 xinput + fmul v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v4.4s,v29.4s,v25.s[2] //k12 + + st1 {v4.4s},[x10],#16 + + //L0 + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v8.4s,v12.4s,v24.s[1] //k01 + fmla v8.4s,v29.4s,v24.s[2] //k02 + + st1 {v8.4s},[x6],#16 + + b first_row_done + +first_less_4: + cmp x8,1 + bge first_1_2_3 + b first_row_done + +first_1_2_3: + dup v12.4s,v31.s[0] + dup v13.4s,v31.s[0] + + //2 or 3 items + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + sub x7,x8,1 + cbz x7, first_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + sub x7,x8,2 + + cbz x7, first_left_load_done + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + +first_left_load_done: + + ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + //L-1 + fmul v4.4s,v28.4s,v25.s[0] //k10, + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v8.4s,v12.4s,v24.s[1] //k01 + fmla v4.4s,v29.4s,v25.s[2] //k12 + fmla v8.4s,v29.4s,v24.s[2] //k02 + + //save result: 2 or 3 + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + ins v28.s[0],v8.s[0] + str s28,[x6],#4 + + cmp x8, 2 + blt first_row_done + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + + + ins v28.s[0],v8.s[1] + str s28,[x6],#4 + + cmp x8,3 + blt first_row_done + + ins v28.s[0],v4.s[2] + str s28,[x10] + + ins v28.s[0],v8.s[2] + str s28,[x6] + +first_row_done: + +mid_row_start: + + sub x1,x1,1 + cbz x1, last_row_start + + sub x7,x2,1 //save one + lsr x8,x7,2 + lsl x7,x8,2 + + add x10,x4,x9 //L-1 + add x6,x10,x9 //L0 + dup v27.4s,v31.s[0] + + cbz x8,mid_last_4 + +mid_loop_start: + + ld1 {v0.4s},[x4] + ld1 {v4.4s},[x10] + //ld1 {v8.4s},[x6],#16 //L0 is always zero + + ld1 {v12.4s},[x0],#16 + ld1r {v13.4s},[x0] + + ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 + //v12: a00, a01, a02 ,a03 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmul 
v8.4s,v28.4s,v24.s[0] //k00 + fmla v0.4s,v12.4s,v26.s[1] //k21, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v8.4s,v12.4s,v24.s[1] //k01 + fmla v0.4s,v29.4s,v26.s[2] //k22 + fmla v4.4s,v29.4s,v25.s[2] //k12 + fmla v8.4s,v29.4s,v24.s[2] //k02 +//add bias + fadd v0.4s,v0.4s,v30.4s +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + st1 {v0.4s},[x4],#16 + + //L-1 + st1 {v4.4s},[x10],#16 + + + //L0 + st1 {v8.4s},[x6],#16 + + ins v27.s[3],v12.s[3] + + //next loop + subs x8,x8,1 + b.ne mid_loop_start + +mid_last_4: + sub x8,x2,x7 + cmp x8,4 + blt mid_less_4 + + ld1 {v0.4s},[x4] + ld1 {v4.4s},[x10] + + ld1 {v12.4s},[x0],#16 + ins v13.s[0],v31.s[0] + + ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 + //v12: a00, a01, a02 ,a03 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v0.4s,v12.4s,v26.s[1] //k21, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v8.4s,v12.4s,v24.s[1] //k01 + fmla v0.4s,v29.4s,v26.s[2] //k22 + fmla v4.4s,v29.4s,v25.s[2] //k12 + fmla v8.4s,v29.4s,v24.s[2] //k02 +//add bias + fadd v0.4s,v0.4s,v30.4s +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + st1 {v0.4s},[x4],#16 + + + //L-1 + st1 {v4.4s},[x10],#16 + + //L0 + st1 {v8.4s},[x6],#16 + + b mid_row_start + +mid_less_4: + cmp x8,1 + blt mid_row_start + +mid_left_1_2_3: + + dup v12.4s,v31.s[0] + dup v13.4s,v31.s[0] + dup v0.4s,v31.s[0] + dup v4.4s,v31.s[0] + + + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + + ldr s28,[x4] + ins v0.s[0],v28.s[0] + ldr s28,[x10] + ins v4.s[0],v28.s[0] + + + cmp x8,2 + blt mid_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + + ldr s28,[x4,#4] + ins v0.s[1],v28.s[0] + ldr s28,[x10, #4] + ins v4.s[1],v28.s[0] + + cmp x8,3 + blt mid_left_load_done + + + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + + ldr s28,[x4,#8] + ins v0.s[2],v28.s[0] + ldr s28,[x10, #8] + ins v4.s[2],v28.s[0] + +mid_left_load_done: + + ext 
v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v0.4s,v12.4s,v26.s[1] //k21, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v8.4s,v12.4s,v24.s[1] //k01 + + fmla v0.4s,v29.4s,v26.s[2] //k22 + fmla v4.4s,v29.4s,v25.s[2] //k12 + fmla v8.4s,v29.4s,v24.s[2] //k02 + +//add bias + fadd v0.4s,v0.4s,v30.4s + //save result:1, 2 or 3 + ins v28.s[0],v0.s[0] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + ins v28.s[0],v8.s[0] + str s28,[x6],#4 + + cmp x8,2 + blt mid_row_start + + ins v28.s[0],v0.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + + ins v28.s[0],v8.s[1] + str s28,[x6],#4 + + cmp x8,3 + blt mid_row_start + + ins v28.s[0],v0.s[2] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v4.s[2] + str s28,[x10] + + ins v28.s[0],v8.s[2] + str s28,[x6] + + b mid_row_start + + +last_row_start: + + + sub x7,x2,1 + lsr x8,x7,2 + lsl x7,x8,2 + + dup v27.4s,v31.s[0] + + add x10,x4,x9 //L-1 + + cbz x8,last_last_4 + +last_loop_start: + + ld1 {v0.4s},[x4] + ld1 {v4.4s},[x10] + + ld1 {v12.4s},[x0],#16 + ld1 {v13.4s},[x0] + + ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 + //v12: a00, a01, a02 ,a03 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v0.4s,v12.4s,v26.s[1] //k21, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v0.4s,v29.4s,v26.s[2] //k22 + fmla v4.4s,v29.4s,v25.s[2] //k12 +//add bias + fadd v0.4s,v0.4s,v30.4s + +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + st1 {v0.4s},[x4],#16 + + //L-1 +//add bias + fadd v4.4s,v4.4s,v30.4s +#ifdef CONV_RELU_FUSE + fmax v4.4s,v4.4s,v31.4s +#endif + st1 {v4.4s},[x10],#16 + + ins 
v27.s[3],v12.s[3] + + //next loop + subs x8,x8,1 + b.ne last_loop_start + +last_last_4: + + sub x8,x2,x7 + cmp x8,4 + blt last_less_4 + + ld1 {v12.4s},[x0],#16 + dup v13.4s,v31.s[0] + + ext v28.16b,v27.16b,v12.16b,12 // last_3 , a00, a01, a02 + //v12: a00, a01, a02 ,a03 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + ld1 {v0.4s},[x4] + ld1 {v4.4s},[x10] + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v0.4s,v12.4s,v26.s[1] //k21, + fmla v0.4s,v29.4s,v26.s[2] //k22 +//add bias + fadd v0.4s,v0.4s,v30.4s + +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + st1 {v0.4s},[x4],#16 + + + //L-1 + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v4.4s,v29.4s,v25.s[2] //k12 +//add bias + fadd v4.4s,v4.4s,v30.4s +#ifdef CONV_RELU_FUSE + fmax v4.4s,v4.4s,v31.4s +#endif + st1 {v4.4s},[x10],#16 + + ins v27.s[3],v12.s[3] + + b last_row_done + +last_less_4: + + cmp x8,1 + blt last_row_done + +last_1_2_3: + + dup v12.4s,v31.s[0] + dup v13.4s,v31.s[0] + dup v0.4s,v31.s[0] + dup v4.4s,v31.s[0] + + + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + ldr s28,[x4] + ins v0.s[0],v28.s[0] + ldr s28,[x10] + ins v4.s[0],v28.s[0] + + sub x7,x8,1 + cbz x7, last_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + + ldr s28,[x4,#4] + ins v0.s[1],v28.s[0] + ldr s28,[x10,#4] + ins v4.s[1],v28.s[0] + + + sub x7,x8,2 + cbz x7, last_left_load_done + + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + + ldr s28,[x4,#8] + ins v0.s[2],v28.s[0] + ldr s28,[x10,#8] + ins v4.s[2],v28.s[0] + +last_left_load_done: + + ext v28.16b,v27.16b,v12.16b,12 //last_3 , a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v0.4s,v12.4s,v26.s[1] //k21, + fmla v0.4s,v29.4s,v26.s[2] //k22 + + //L-1 + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v4.4s,v29.4s,v25.s[2] //k12 + +//add bias + fadd v0.4s,v0.4s,v30.4s + //save result: 1 2 or 3 + ins v28.s[0],v0.s[0] +#ifdef CONV_RELU_FUSE + 
fmax s28,s28,s31 +#endif + str s28,[x4],#4 + +//add bias + fadd v4.4s,v4.4s,v30.4s + + ins v28.s[0],v4.s[0] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x10],#4 + + cmp x8,2 + blt last_row_done + + ins v28.s[0],v0.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v4.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x10],#4 + + cmp x8,3 + blt last_row_done + + ins v28.s[0],v0.s[2] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4] + + ins v28.s[0],v4.s[2] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x10] + + +last_row_done: + ret + +single_line: + mov x10,x4 + cbz x8,single_line_last_4 + +single_line_row_loop: + //load 4 input + ld1 {v12.4s},[x0],#16 + ld1r {v13.4s},[x0] + + ext v28.16b,v27.16b,v12.16b,12 //last_3, a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + ins v27.s[3],v12.s[3] + + dup v4.4s,v30.s[0] + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v4.4s,v29.4s,v25.s[2] //k12 +#ifdef CONV_RELU_FUSE + fmax v4.4s,v4.4s,v31.4s +#endif + + st1 {v4.4s},[x10],#16 + + //next loop + subs x8,x8,1 + b.ne single_line_row_loop + +single_line_last_4: + //x8=x2-x7, and x7<=x2-1 and x7=4N and N is non-negative number, so left ones: 1-4 + sub x8,x2,x7 + cmp x8,4 + blt single_line_less_4 + + ld1 {v12.4s},[x0],#16 + ins v13.s[0],v31.s[0] + + ext v28.16b,v27.16b,v12.16b,12 //last_3, a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + + dup v4.4s,v30.s[0] + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v12.4s,v25.s[1] //k11, + fmla v4.4s,v29.4s,v25.s[2] //k12 +#ifdef CONV_RELU_FUSE + fmax v4.4s,v4.4s,v31.4s +#endif + + st1 {v4.4s},[x10],#16 + b single_line_done + +single_line_less_4: + cmp x8,1 + bge single_line_1_2_3 + b single_line_done + +single_line_1_2_3: + dup v12.4s,v31.s[0] + dup v13.4s,v31.s[0] + + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + sub x7,x8,1 + cbz x7,single_line_left_load_done + + ldr 
s28,[x0],#4 + ins v12.s[1],v28.s[0] + sub x7,x8,2 + + cbz x7,single_line_left_load_done + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + +single_line_left_load_done: + ext v28.16b,v27.16b,v12.16b,12 //last_3, a00, a01, a02 + ext v29.16b,v12.16b,v13.16b,4 //a01, a02, a03, a04 + + dup v4.4s,v30.s[0] + fmla v4.4s,v28.4s,v25.s[0] //k10 + fmla v4.4s,v12.4s,v25.s[1] //k11 + fmla v4.4s,v29.4s,v25.s[2] //k12 +#ifdef CONV_RELU_FUSE + fmax v4.4s,v4.4s,v31.4s +#endif + + //save result + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + cmp x8,2 + blt single_line_done + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + + cmp x8,3 + blt single_line_done + + ins v28.s[0],v4.s[2] + str s28,[x10] + +single_line_done: + ret diff --git a/executor/operator/arm64/conv/dw_k3s1p1_relu_fused.S b/executor/operator/arm64/conv/dw_k3s1p1_relu_fused.S index 6466b842c..9c4ac3f7a 100644 --- a/executor/operator/arm64/conv/dw_k3s1p1_relu_fused.S +++ b/executor/operator/arm64/conv/dw_k3s1p1_relu_fused.S @@ -1,27 +1,27 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ -#define KERNEL_NAME dw_k3s1p1_relu_fused -#define CONV_RELU_FUSE - -#include "./dw_k3s1p1.S" +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: haitao@openailab.com + */ +#define KERNEL_NAME dw_k3s1p1_relu_fused +#define CONV_RELU_FUSE + +#include "./dw_k3s1p1.S" diff --git a/executor/operator/arm64/conv/dw_k3s2p1.S b/executor/operator/arm64/conv/dw_k3s2p1.S index 808ead01d..3f796c5db 100644 --- a/executor/operator/arm64/conv/dw_k3s2p1.S +++ b/executor/operator/arm64/conv/dw_k3s2p1.S @@ -1,689 +1,689 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haitao@openailab.com - */ -//x0: input -//x1: h -//x2: w -//x3: kernel -//x4: output //L-2 -//x5: bias -//x10: L-1 output -//x6: L0 output -//x7: processed item -//x8: counter -//x9: output width - -//v0-v3: L-2 -//v4-v7: L-1 -//v8-v11: L0 -//v12-v15/v16-v20: input two group -//v24-v26: kernel -//v27 --- saved previous vector -// v28,v29 --- shifted - -//v20 bias - -#ifndef KERNEL_NAME -#define KERNEL_NAME dw_k3s2p1 -#endif - -.text -.align 5 -.global KERNEL_NAME -.type KERNEL_NAME, %function - - -KERNEL_NAME: - //Load Kernel - ld1 {v24.4s,v25.4s,v26.4s}, [x3] - ext v26.16b,v25.16b,v26.16b,8 - ext v25.16b,v24.16b,v25.16b,12 - - sub x9,x2,1 - lsr x9,x9,1 - add x9,x9,1 - lsl x9,x9,2 - fmov s31,wzr - dup v31.4s,v31.s[0] - - //get bias - cbz x5,non_biases - ldr s21,[x5] - dup v21.4s,v21.s[0] - b first_row_start - -non_biases: - fmov s21,wzr - dup v21.4s,v21.s[0] - -//first row - -first_row_start: - sub x1,x1,1 - - lsr x8,x2,3 //x8 loop counter - lsl x7,x8,3 //x7 processed number - - ins v27.s[3],v31.s[0] //pre_vector for input - - mov x10,x4 //L-1 //L1 ONLY - cbz x8,first_less_8 - -first_loop_start: - //load 4 float input - ld1 {v12.4s,v13.4s},[x0],#32 //a00,a01,a02,a03,a04,a05,a06,a07 - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - /* - v28: last_3, a01, a03, a05 - v29 a00 a02, a04, a06 - v30 a01 a03, a05, a07 - */ - - //L-1: k1 xinput - fmul v4.4s,v28.4s,v25.s[0] //k10, - fmla 
v4.4s,v29.4s,v25.s[1] //k11, - fmla v4.4s,v30.4s,v25.s[2] //k12 - - ins v27.s[3],v13.s[3] //save prev vector - - //save data, four are valid - st1 {v4.4s},[x10],#16 - - //next loop - subs x8,x8,1 - b.ne first_loop_start - -first_less_8: - - sub x8,x2,x7 - cmp x8,1 - blt first_row_done - -first_1_7: - dup v13.4s,v31.s[0] - - cmp x8,4 - blt first_1_2_3 - - ld1 {v12.4s},[x0],#16 - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-1 - fmul v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v29.4s,v25.s[1] //k11, - fmla v4.4s,v30.4s,v25.s[2] //k12 - - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - - sub x8,x8,4 - cbz x8,first_row_done - - ins v27.s[3],v12.s[3] - -first_1_2_3: - dup v12.4s,v31.s[0] - - //1-3 items - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - - cmp x8,2 - blt first_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - - cmp x8,3 - blt first_left_load_done - - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - -first_left_load_done: - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-1 - fmul v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v29.4s,v25.s[1] //k11, - fmla v4.4s,v30.4s,v25.s[2] //k12 - -first_left_save_1_3: - - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - cmp x8,3 - blt first_row_done - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - -first_row_done: - - -odd_row_start: - sub x1,x1,1 - cbz x1, last_row_is_odd - - lsr x8,x2,3 - lsl x7,x8,3 - - dup v27.4s,v31.s[0] - //x4: L-2 - add x6,x4,x9 //L0 - - cbz x8,odd_less_8 - -odd_loop_start: - - ld1 {v0.4s}, [x4] //L-2 - ld1 {v12.4s,v13.4s},[x0],#32 - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmul v8.4s,v28.4s,v24.s[0] 
//k00 - fmla v0.4s,v29.4s,v26.s[1] //k21, - fmla v8.4s,v29.4s,v24.s[1] //k01 - fmla v0.4s,v30.4s,v26.s[2] //k22 - fmla v8.4s,v30.4s,v24.s[2] //k02 -//add bias - fadd v0.4s,v0.4s,v21.4s - -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - - //L0 is always zero - - st1 {v0.4s}, [x4],#16 - st1 {v8.4s}, [x6],#16 - - ins v27.s[3],v13.s[3] - - //next loop - subs x8,x8,1 - b.ne odd_loop_start - -odd_less_8: - sub x8,x2,x7 - cmp x8,1 - blt odd_row_done - -odd_1_7: - dup v13.4s,v31.s[0] - cmp x8,4 - blt odd_1_2_3 - - ld1 {v12.4s},[x0],#16 - - ldr s28,[x4] - ins v0.s[0],v28.s[0] - - ldr s28,[x4,#4] - ins v0.s[1],v28.s[0] - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla v0.4s,v29.4s,v26.s[1] //k21, - fmla v8.4s,v29.4s,v24.s[1] //k01 - fmla v0.4s,v30.4s,v26.s[2] //k22 - fmla v8.4s,v30.4s,v24.s[2] //k02 - -//add bias - fadd v0.4s,v0.4s,v21.4s - //L0 is always zero - ins v28.s[0],v0.s[0] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v8.s[0] - str s28,[x6],#4 - - ins v28.s[0],v0.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v8.s[1] - str s28,[x6],#4 - - sub x8,x8,4 - cbz x8, odd_row_done - - ins v27.s[3],v12.s[3] - -odd_1_2_3: - - dup v12.4s,v31.s[0] - - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - - ldr s28,[x4] - ins v0.s[0],v28.s[0] - - cmp x8,2 - blt odd_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - - cmp x8,3 - blt odd_left_load_done - - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - - ldr s28,[x4,#4] - ins v0.s[1],v28.s[0] - -odd_left_load_done: - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmul v8.4s,v28.4s,v24.s[0] //k00 - fmla 
v0.4s,v29.4s,v26.s[1] //k21, - fmla v8.4s,v29.4s,v24.s[1] //k01 - fmla v0.4s,v30.4s,v26.s[2] //k22 - fmla v8.4s,v30.4s,v24.s[2] //k02 - - //L0 -//add bias - fadd v0.4s,v0.4s,v21.4s - //save result:1 or 2 - ins v28.s[0],v0.s[0] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v8.s[0] - str s28,[x6],#4 - - cmp x8,3 - blt odd_row_done - - ins v28.s[0],v0.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v8.s[1] - str s28,[x6],#4 - -odd_row_done: - -even_row_start: - - lsr x8,x2,3 - lsl x7,x8,3 - - ins v27.s[3],v31.s[0] //pre_vector for input - - mov x10,x4 //L-1 //L1 ONLY - cbz x8,even_less_8 - -even_loop_start: - //load 4 float input - ld1 {v12.4s,v13.4s},[x0],#32 - ld1 {v4.4s},[x10] - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-1: k1 xinput - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v29.4s,v25.s[1] //k11, - fmla v4.4s,v30.4s,v25.s[2] //k12 - - ins v27.s[3],v13.s[3] //save prev vector - - st1 {v4.4s},[x10],#16 - - //next loop - subs x8,x8,1 - b.ne even_loop_start - -even_less_8: - - sub x8,x2,x7 - cmp x8,1 - blt even_row_done - -even_1_7: - dup v13.4s,v31.s[0] - - cmp x8,4 - blt even_1_2_3 - - ld1 {v12.4s},[x0],#16 - ldr s28,[x10] - ins v4.s[0],v28.s[0] - ldr s28,[x10,#4] - ins v4.s[1],v28.s[0] - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-1: k1 xinput - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v29.4s,v25.s[1] //k11, - fmla v4.4s,v30.4s,v25.s[2] //k12 - - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - - sub x8,x8,4 - cbz x8, even_row_done - - ins v27.s[3],v12.s[3] //save prev vector - -even_1_2_3: - dup v12.4s,v31.s[0] - - //1, 2 or 3 items - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - - ldr s28,[x10] - ins 
v4.s[0],v28.s[0] - - sub x7,x8,1 - cbz x7, even_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - - sub x7,x8,2 - cbz x7, even_left_load_done - - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - - ldr s28,[x10,#4] - ins v4.s[1],v28.s[0] - -even_left_load_done: - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-1: k1 xinput - fmla v4.4s,v28.4s,v25.s[0] //k10, - fmla v4.4s,v29.4s,v25.s[1] //k11, - fmla v4.4s,v30.4s,v25.s[2] //k12 - - - //save result: 1 or 2 - ins v28.s[0],v4.s[0] - str s28,[x10],#4 - - cmp x8,3 - blt even_row_done - - ins v28.s[0],v4.s[1] - str s28,[x10],#4 - -even_row_done: - sub x1,x1,1 - cbz x1, last_even_add_bias - b odd_row_start - -last_even_add_bias: - mov x10,x4 - //cal out_w - sub x6,x2,1 - lsr x6,x6,1 - add x6,x6,1 - //finish - lsr x8,x6,3 - lsl x7,x8,3 - cbz x8,last_even_less_8 -last_even_loop_start: - ld1 {v12.4s,v13.4s},[x10],#32 -//add bias - fadd v12.4s,v12.4s,v21.4s - fadd v13.4s,v13.4s,v21.4s -#ifdef CONV_RELU_FUSE - fmax v12.4s,v12.4s,v31.4s - fmax v13.4s,v13.4s,v31.4s -#endif - st1 {v12.4s},[x4],#16 - st1 {v13.4s},[x4],#16 -// next loop - subs x8,x8,1 - b.ne last_even_loop_start -last_even_less_8: - subs x8,x6,x7 - cmp x8,1 - blt last_even_loop_done - cmp x8,4 - blt last_even_1_2_3 - ld1 {v0.4s},[x10],#16 -//add bias - fadd v0.4s,v0.4s,v21.4s -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - st1 {v0.4s},[x4],#16 - subs x8,x8,4 - cbz x8,last_even_loop_done -last_even_1_2_3: - cmp x8,1 - blt last_even_loop_done - ldr s0,[x10],#0x4 - //add bias - fadd s0,s0,s21 -#ifdef CONV_RELU_FUSE - fmax s0,s0,s31 -#endif - str s0,[x4],#0x4 - subs x8,x8,1 - cbz x8,last_even_loop_done - b last_even_1_2_3 - -last_even_loop_done: - b all_row_done - -// Last Row: even or odd - -last_row_is_odd: - - lsr x8,x2,3 - lsl x7,x8,3 - - dup v27.4s,v31.s[0] - cbz x8,last_odd_less_8 - -last_odd_loop_start: - - ld1 {v0.4s},[x4] //L-2 - ld1 
{v12.4s,v13.4s},[x0],#32 - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v0.4s,v29.4s,v26.s[1] //k21, - fmla v0.4s,v30.4s,v26.s[2] //k22 -//add bias - fadd v0.4s,v0.4s,v21.4s - -#ifdef CONV_RELU_FUSE - fmax v0.4s,v0.4s,v31.4s -#endif - st1 {v0.4s},[x4],#16 - - ins v27.s[3],v13.s[3] - - //next loop - subs x8,x8,1 - b.ne last_odd_loop_start - -last_odd_less_8: - sub x8,x2,x7 - cmp x8,1 - blt last_odd_row_done - cmp x8,4 - blt last_odd_1_2_3 - - ld1 {v12.4s},[x0],#16 - dup v13.4s,v31.s[0] - - //L-2 - ldr s28,[x4] - ins v0.s[0],v28.s[0] - ldr s28,[x4,#4] - ins v0.s[1],v28.s[0] - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v0.4s,v29.4s,v26.s[1] //k21, - fmla v0.4s,v30.4s,v26.s[2] //k22 - -//add bias - fadd v0.4s,v0.4s,v21.4s - ins v28.s[0],v0.s[0] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - ins v28.s[0],v0.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - sub x8,x8,4 - cbz x8,last_odd_row_done - - ins v27.s[3],v12.s[3] - -last_odd_1_2_3: - - dup v12.4s,v31.s[0] - - ldr s28,[x0],#4 - ins v12.s[0],v28.s[0] - - ldr s28,[x4] - ins v0.s[0],v28.s[0] - - cmp x8,2 - blt last_odd_left_load_done - - ldr s28,[x0],#4 - ins v12.s[1],v28.s[0] - - cmp x8,3 - blt last_odd_left_load_done - - ldr s28,[x0],#4 - ins v12.s[2],v28.s[0] - - ldr s28,[x4,#4] - ins v0.s[1],v28.s[0] - -last_odd_left_load_done: - - uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 - uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 - ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 - - //L-2 - fmla v0.4s,v28.4s,v26.s[0] //k20, - fmla v0.4s,v29.4s,v26.s[1] //k21, - fmla v0.4s,v30.4s,v26.s[2] //k22 - -//add bias - fadd v0.4s,v0.4s,v21.4s - //save result:1 or 2 - ins 
v28.s[0],v0.s[0] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - - cmp x8,3 - blt last_odd_row_done - - ins v28.s[0],v0.s[1] -#ifdef CONV_RELU_FUSE - fmax s28,s28,s31 -#endif - str s28,[x4],#4 - - -last_odd_row_done: -all_row_done: - ret - - - - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ +//x0: input +//x1: h +//x2: w +//x3: kernel +//x4: output //L-2 +//x5: bias +//x10: L-1 output +//x6: L0 output +//x7: processed item +//x8: counter +//x9: output width + +//v0-v3: L-2 +//v4-v7: L-1 +//v8-v11: L0 +//v12-v15/v16-v20: input two group +//v24-v26: kernel +//v27 --- saved previous vector +// v28,v29 --- shifted + +//v20 bias + +#ifndef KERNEL_NAME +#define KERNEL_NAME dw_k3s2p1 +#endif + +.text +.align 5 +.global KERNEL_NAME +.type KERNEL_NAME, %function + + +KERNEL_NAME: + //Load Kernel + ld1 {v24.4s,v25.4s,v26.4s}, [x3] + ext v26.16b,v25.16b,v26.16b,8 + ext v25.16b,v24.16b,v25.16b,12 + + sub x9,x2,1 + lsr x9,x9,1 + add x9,x9,1 + lsl x9,x9,2 + fmov s31,wzr + dup v31.4s,v31.s[0] + + //get bias + cbz x5,non_biases + ldr s21,[x5] + dup v21.4s,v21.s[0] + b first_row_start + +non_biases: + fmov s21,wzr + dup v21.4s,v21.s[0] + +//first row + +first_row_start: + sub x1,x1,1 + + lsr x8,x2,3 //x8 loop counter + lsl x7,x8,3 //x7 processed number + + ins v27.s[3],v31.s[0] //pre_vector for input + + mov x10,x4 //L-1 //L1 ONLY + cbz x8,first_less_8 + +first_loop_start: + //load 4 float input + ld1 {v12.4s,v13.4s},[x0],#32 //a00,a01,a02,a03,a04,a05,a06,a07 + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + /* + v28: last_3, a01, a03, a05 + v29 a00 a02, a04, a06 + v30 a01 a03, a05, a07 + */ + + //L-1: k1 xinput + fmul v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v29.4s,v25.s[1] //k11, + fmla v4.4s,v30.4s,v25.s[2] //k12 + + ins v27.s[3],v13.s[3] //save prev vector + + //save data, four are valid + st1 {v4.4s},[x10],#16 + + //next loop + subs x8,x8,1 + b.ne first_loop_start + +first_less_8: + + sub x8,x2,x7 + cmp x8,1 + blt first_row_done + +first_1_7: + dup v13.4s,v31.s[0] + + cmp x8,4 + blt first_1_2_3 + + ld1 {v12.4s},[x0],#16 + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 
v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-1 + fmul v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v29.4s,v25.s[1] //k11, + fmla v4.4s,v30.4s,v25.s[2] //k12 + + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + + sub x8,x8,4 + cbz x8,first_row_done + + ins v27.s[3],v12.s[3] + +first_1_2_3: + dup v12.4s,v31.s[0] + + //1-3 items + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + + cmp x8,2 + blt first_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + + cmp x8,3 + blt first_left_load_done + + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + +first_left_load_done: + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-1 + fmul v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v29.4s,v25.s[1] //k11, + fmla v4.4s,v30.4s,v25.s[2] //k12 + +first_left_save_1_3: + + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + cmp x8,3 + blt first_row_done + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + +first_row_done: + + +odd_row_start: + sub x1,x1,1 + cbz x1, last_row_is_odd + + lsr x8,x2,3 + lsl x7,x8,3 + + dup v27.4s,v31.s[0] + //x4: L-2 + add x6,x4,x9 //L0 + + cbz x8,odd_less_8 + +odd_loop_start: + + ld1 {v0.4s}, [x4] //L-2 + ld1 {v12.4s,v13.4s},[x0],#32 + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v0.4s,v29.4s,v26.s[1] //k21, + fmla v8.4s,v29.4s,v24.s[1] //k01 + fmla v0.4s,v30.4s,v26.s[2] //k22 + fmla v8.4s,v30.4s,v24.s[2] //k02 +//add bias + fadd v0.4s,v0.4s,v21.4s + +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + + //L0 is always zero + + st1 {v0.4s}, [x4],#16 + st1 {v8.4s}, [x6],#16 + + ins v27.s[3],v13.s[3] + + //next loop + subs x8,x8,1 + b.ne odd_loop_start + +odd_less_8: + sub x8,x2,x7 + cmp 
x8,1 + blt odd_row_done + +odd_1_7: + dup v13.4s,v31.s[0] + cmp x8,4 + blt odd_1_2_3 + + ld1 {v12.4s},[x0],#16 + + ldr s28,[x4] + ins v0.s[0],v28.s[0] + + ldr s28,[x4,#4] + ins v0.s[1],v28.s[0] + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v0.4s,v29.4s,v26.s[1] //k21, + fmla v8.4s,v29.4s,v24.s[1] //k01 + fmla v0.4s,v30.4s,v26.s[2] //k22 + fmla v8.4s,v30.4s,v24.s[2] //k02 + +//add bias + fadd v0.4s,v0.4s,v21.4s + //L0 is always zero + ins v28.s[0],v0.s[0] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v8.s[0] + str s28,[x6],#4 + + ins v28.s[0],v0.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v8.s[1] + str s28,[x6],#4 + + sub x8,x8,4 + cbz x8, odd_row_done + + ins v27.s[3],v12.s[3] + +odd_1_2_3: + + dup v12.4s,v31.s[0] + + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + + ldr s28,[x4] + ins v0.s[0],v28.s[0] + + cmp x8,2 + blt odd_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + + cmp x8,3 + blt odd_left_load_done + + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + + ldr s28,[x4,#4] + ins v0.s[1],v28.s[0] + +odd_left_load_done: + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmul v8.4s,v28.4s,v24.s[0] //k00 + fmla v0.4s,v29.4s,v26.s[1] //k21, + fmla v8.4s,v29.4s,v24.s[1] //k01 + fmla v0.4s,v30.4s,v26.s[2] //k22 + fmla v8.4s,v30.4s,v24.s[2] //k02 + + //L0 +//add bias + fadd v0.4s,v0.4s,v21.4s + //save result:1 or 2 + ins v28.s[0],v0.s[0] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v8.s[0] + str s28,[x6],#4 + + cmp x8,3 + blt odd_row_done + + ins v28.s[0],v0.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 
+#endif + str s28,[x4],#4 + + ins v28.s[0],v8.s[1] + str s28,[x6],#4 + +odd_row_done: + +even_row_start: + + lsr x8,x2,3 + lsl x7,x8,3 + + ins v27.s[3],v31.s[0] //pre_vector for input + + mov x10,x4 //L-1 //L1 ONLY + cbz x8,even_less_8 + +even_loop_start: + //load 4 float input + ld1 {v12.4s,v13.4s},[x0],#32 + ld1 {v4.4s},[x10] + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-1: k1 xinput + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v29.4s,v25.s[1] //k11, + fmla v4.4s,v30.4s,v25.s[2] //k12 + + ins v27.s[3],v13.s[3] //save prev vector + + st1 {v4.4s},[x10],#16 + + //next loop + subs x8,x8,1 + b.ne even_loop_start + +even_less_8: + + sub x8,x2,x7 + cmp x8,1 + blt even_row_done + +even_1_7: + dup v13.4s,v31.s[0] + + cmp x8,4 + blt even_1_2_3 + + ld1 {v12.4s},[x0],#16 + ldr s28,[x10] + ins v4.s[0],v28.s[0] + ldr s28,[x10,#4] + ins v4.s[1],v28.s[0] + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-1: k1 xinput + fmla v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v29.4s,v25.s[1] //k11, + fmla v4.4s,v30.4s,v25.s[2] //k12 + + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + + sub x8,x8,4 + cbz x8, even_row_done + + ins v27.s[3],v12.s[3] //save prev vector + +even_1_2_3: + dup v12.4s,v31.s[0] + + //1, 2 or 3 items + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + + ldr s28,[x10] + ins v4.s[0],v28.s[0] + + sub x7,x8,1 + cbz x7, even_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + + sub x7,x8,2 + cbz x7, even_left_load_done + + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + + ldr s28,[x10,#4] + ins v4.s[1],v28.s[0] + +even_left_load_done: + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-1: k1 xinput + fmla 
v4.4s,v28.4s,v25.s[0] //k10, + fmla v4.4s,v29.4s,v25.s[1] //k11, + fmla v4.4s,v30.4s,v25.s[2] //k12 + + + //save result: 1 or 2 + ins v28.s[0],v4.s[0] + str s28,[x10],#4 + + cmp x8,3 + blt even_row_done + + ins v28.s[0],v4.s[1] + str s28,[x10],#4 + +even_row_done: + sub x1,x1,1 + cbz x1, last_even_add_bias + b odd_row_start + +last_even_add_bias: + mov x10,x4 + //cal out_w + sub x6,x2,1 + lsr x6,x6,1 + add x6,x6,1 + //finish + lsr x8,x6,3 + lsl x7,x8,3 + cbz x8,last_even_less_8 +last_even_loop_start: + ld1 {v12.4s,v13.4s},[x10],#32 +//add bias + fadd v12.4s,v12.4s,v21.4s + fadd v13.4s,v13.4s,v21.4s +#ifdef CONV_RELU_FUSE + fmax v12.4s,v12.4s,v31.4s + fmax v13.4s,v13.4s,v31.4s +#endif + st1 {v12.4s},[x4],#16 + st1 {v13.4s},[x4],#16 +// next loop + subs x8,x8,1 + b.ne last_even_loop_start +last_even_less_8: + subs x8,x6,x7 + cmp x8,1 + blt last_even_loop_done + cmp x8,4 + blt last_even_1_2_3 + ld1 {v0.4s},[x10],#16 +//add bias + fadd v0.4s,v0.4s,v21.4s +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + st1 {v0.4s},[x4],#16 + subs x8,x8,4 + cbz x8,last_even_loop_done +last_even_1_2_3: + cmp x8,1 + blt last_even_loop_done + ldr s0,[x10],#0x4 + //add bias + fadd s0,s0,s21 +#ifdef CONV_RELU_FUSE + fmax s0,s0,s31 +#endif + str s0,[x4],#0x4 + subs x8,x8,1 + cbz x8,last_even_loop_done + b last_even_1_2_3 + +last_even_loop_done: + b all_row_done + +// Last Row: even or odd + +last_row_is_odd: + + lsr x8,x2,3 + lsl x7,x8,3 + + dup v27.4s,v31.s[0] + cbz x8,last_odd_less_8 + +last_odd_loop_start: + + ld1 {v0.4s},[x4] //L-2 + ld1 {v12.4s,v13.4s},[x0],#32 + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v0.4s,v29.4s,v26.s[1] //k21, + fmla v0.4s,v30.4s,v26.s[2] //k22 +//add bias + fadd v0.4s,v0.4s,v21.4s + +#ifdef CONV_RELU_FUSE + fmax v0.4s,v0.4s,v31.4s +#endif + st1 {v0.4s},[x4],#16 + + ins v27.s[3],v13.s[3] + + //next loop + 
subs x8,x8,1 + b.ne last_odd_loop_start + +last_odd_less_8: + sub x8,x2,x7 + cmp x8,1 + blt last_odd_row_done + cmp x8,4 + blt last_odd_1_2_3 + + ld1 {v12.4s},[x0],#16 + dup v13.4s,v31.s[0] + + //L-2 + ldr s28,[x4] + ins v0.s[0],v28.s[0] + ldr s28,[x4,#4] + ins v0.s[1],v28.s[0] + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v0.4s,v29.4s,v26.s[1] //k21, + fmla v0.4s,v30.4s,v26.s[2] //k22 + +//add bias + fadd v0.4s,v0.4s,v21.4s + ins v28.s[0],v0.s[0] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + ins v28.s[0],v0.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + sub x8,x8,4 + cbz x8,last_odd_row_done + + ins v27.s[3],v12.s[3] + +last_odd_1_2_3: + + dup v12.4s,v31.s[0] + + ldr s28,[x0],#4 + ins v12.s[0],v28.s[0] + + ldr s28,[x4] + ins v0.s[0],v28.s[0] + + cmp x8,2 + blt last_odd_left_load_done + + ldr s28,[x0],#4 + ins v12.s[1],v28.s[0] + + cmp x8,3 + blt last_odd_left_load_done + + ldr s28,[x0],#4 + ins v12.s[2],v28.s[0] + + ldr s28,[x4,#4] + ins v0.s[1],v28.s[0] + +last_odd_left_load_done: + + uzp1 v29.4s,v12.4s,v13.4s //a00,a02,a04,a06 + uzp2 v30.4s,v12.4s,v13.4s //a01,a03,a05,a07 + ext v28.16b,v27.16b,v30.16b,12 //last_3 , a01, a03,a05 + + //L-2 + fmla v0.4s,v28.4s,v26.s[0] //k20, + fmla v0.4s,v29.4s,v26.s[1] //k21, + fmla v0.4s,v30.4s,v26.s[2] //k22 + +//add bias + fadd v0.4s,v0.4s,v21.4s + //save result:1 or 2 + ins v28.s[0],v0.s[0] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + + cmp x8,3 + blt last_odd_row_done + + ins v28.s[0],v0.s[1] +#ifdef CONV_RELU_FUSE + fmax s28,s28,s31 +#endif + str s28,[x4],#4 + + +last_odd_row_done: +all_row_done: + ret + + + + diff --git a/executor/operator/arm64/conv/dw_k3s2p1_relu_fused.S b/executor/operator/arm64/conv/dw_k3s2p1_relu_fused.S index f1953b510..2a517114d 100644 --- 
a/executor/operator/arm64/conv/dw_k3s2p1_relu_fused.S +++ b/executor/operator/arm64/conv/dw_k3s2p1_relu_fused.S @@ -1,27 +1,27 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ -#define KERNEL_NAME dw_k3s2p1_relu_fused -#define CONV_RELU_FUSE - -#include "./dw_k3s2p1.S" +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: haitao@openailab.com + */ +#define KERNEL_NAME dw_k3s2p1_relu_fused +#define CONV_RELU_FUSE + +#include "./dw_k3s2p1.S" diff --git a/executor/operator/arm64/conv/sgemm_4x16_interleave.S b/executor/operator/arm64/conv/sgemm_4x16_interleave.S index 565470e89..6de3ef5c5 100644 --- a/executor/operator/arm64/conv/sgemm_4x16_interleave.S +++ b/executor/operator/arm64/conv/sgemm_4x16_interleave.S @@ -1,313 +1,313 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: xiaowei@openailab.com - */ -// -// 4*16 single precise floating point matric multiplication -// -// -- -- -- -- -- -- -- -- -// | i0 - - - - - - | | k0 k1 .. kf | | t00 t01 .. t0f | | i0k0 i0k1 .. i0kf | -// | | | . . . . | | | | | -// | i1 - - - - - - | | . . . . | | t10 t11 . t1f | | i1k0 i1k1 .. i1kf | -// | | x | . . . . | + | | = | | -// | i2 - - - - - - | | . . . . | | t20 t21 . t2f | | i2k0 i2k1 .. i2kf | -// | | | . . . . | | | | | -// | i3 - - - - - - | | . . . . | | t30 t31 . t3f | | i3k0 i3k1 .. 
i3kf | -// -- -- -- -- -- -- -- -- -// input 4 x p kernel p x 16 biases 4 x 16 output 4 x 16 p = kernel size -// -// -// optimised for Cortex-A72 pipeline 66 cycle per loop (4*16*4 dot product) -// -// input: -// x0 arg0 have biases flag -// x1 arg1 biases start address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3],i[0-3]k[4]..} -// x2 arg2 input start address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...} -// x3 arg3 kernel start address {k[0-15][0],k[0-15][1],k[0-15][2],k[0-15][3],...} -// x4 arg4 output save address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3],i[0-3]k[4]..} -// x5 arg5 kernel size -// -// output: no -// -// register definition -// x0 have biases flag -// x1 biases start address -// x2 input start address -// x3 kernel start address -// x4 output start address -// x5 loop time = kernal size -// x6 ~ x31 not used -// -// v0~v1 4S data of input0 {i3 i2 i1 i0} -// v2-v3 not used -// v4 4S kernal data {k3 | k2 | k1 | k0} -// v5 4S kernal data {k7 | k6 | k5 | k4} -// v6 4S kernal data {kb | ka | k9 | k8} -// v7 4S kernal data {kf | ke | kd | kc} -// v8~v15 not used -// v16 dot product for {i3k0, i2k0, i1k0, i0k0} -// v17 dot product for {i3k1, i2k1, i1k1, i0k1} -// v18 dot product for {i3k2, i2k2, i1k2, i0k2} -// v19 dot product for {i3k3, i2k3, i1k3, i0k3} -// v20 dot product for {i3k4, i2k4, i1k4, i0k4} -// v21 dot product for {i3k5, i2k5, i1k5, i0k5} -// v22 dot product for {i3k6, i2k6, i1k6, i0k6} -// v23 dot product for {i3k7, i2k7, i1k7, i0k7} -// v24 dot product for {i3k8, i2k8, i1k8, i0k8} -// v25 dot product for {i3k9, i2k9, i1k9, i0k9} -// v26 dot product for {i3ka, i2ka, i1ka, i0ka} -// v27 dot product for {i3kb, i2kb, i1kb, i0kb} -// v28 dot product for {i3kc, i2kc, i1kc, i0kc} -// v29 dot product for {i3kd, i2kd, i1kd, i0kd} -// v30 dot product for {i3ke, i2ke, i1ke, i0ke} -// v31 dot product for {i3kf, i2kf, i1kf, i0kf} - -#ifndef INTERLEAVE_FUNC_NAME -#define INTERLEAVE_FUNC_NAME sgemm_4x16_interleave -#endif - - .section 
.text,"ax" - .align 5 - - .type INTERLEAVE_FUNC_NAME STT_FUNC - .global INTERLEAVE_FUNC_NAME - -INTERLEAVE_FUNC_NAME: -// biases_initial - cbz x0, none_biases - ldp q16, q17 ,[x1] - ldp q18, q19 ,[x1, #0x20] - ldp q20, q21 ,[x1, #0x40] - ldp q22, q23 ,[x1, #0x60] - ldp q24, q25 ,[x1, #0x80] - ldp q26, q27 ,[x1, #0xa0] - ldp q28, q29 ,[x1, #0xc0] - ldp q30, q31 ,[x1, #0xe0] - b convolution_start - -none_biases: - movi d16, #0 - movi d17, #0 - movi d18, #0 - movi d19, #0 - movi d20, #0 - movi d21, #0 - movi d22, #0 - movi d23, #0 - movi d24, #0 - movi d25, #0 - movi d26, #0 - movi d27, #0 - movi d28, #0 - movi d29, #0 - movi d30, #0 - movi d31, #0 - -convolution_start: - // compare to 0x4 - cmp x5, 0x4 - blt loop4_end - lsr x6, x5, 0x2 - -// main loop each loop generate dot prodcut for 4x16SFP -loop4: - ldr q0, [x2] // q0=i[3-0] - ldp q4, q5, [x3] // q4=k[3-0] q5=k[7-4] - fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] - fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] - fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] - fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] - ldp q6, q7, [x3, 0x20] // q6=k[b-8] q7=k[f-c] - fmla v20.4s, v0.4s, v5.s[0] // i[3-0]k[4] - fmla v21.4s, v0.4s, v5.s[1] // i[3-0]k[5] - fmla v22.4s, v0.4s, v5.s[2] // i[3-0]k[6] - fmla v23.4s, v0.4s, v5.s[3] // i[3-0]k[7] - ldr q1, [x2, 0x10] // q1=i[3-0] - fmla v24.4s, v0.4s, v6.s[0] // i[3-0]k[8] - fmla v25.4s, v0.4s, v6.s[1] // i[3-0]k[9] - fmla v26.4s, v0.4s, v6.s[2] // i[3-0]k[a] - ldp q4, q5, [x3, 0x40] // q4=k[3-0] q5=k[7-4] - fmla v27.4s, v0.4s, v6.s[3] // i[3-0]k[b] - fmla v28.4s, v0.4s, v7.s[0] // i[3-0]k[c] - fmla v29.4s, v0.4s, v7.s[1] // i[3-0]k[d] - fmla v30.4s, v0.4s, v7.s[2] // i[3-0]k[e] - fmla v31.4s, v0.4s, v7.s[3] // i[3-0]k[f] - - ldp q6, q7, [x3, 0x60] // q6=k[b-8] q7=k[f-c] - fmla v16.4s, v1.4s, v4.s[0] // i[3-0]k[0] - fmla v17.4s, v1.4s, v4.s[1] // i[3-0]k[1] - fmla v18.4s, v1.4s, v4.s[2] // i[3-0]k[2] - fmla v19.4s, v1.4s, v4.s[3] // i[3-0]k[3] - ldr q0, [x2, 0x20] // q1=i[3-0] - fmla v20.4s, v1.4s, 
v5.s[0] // i[3-0]k[4] - fmla v21.4s, v1.4s, v5.s[1] // i[3-0]k[5] - fmla v22.4s, v1.4s, v5.s[2] // i[3-0]k[6] - fmla v23.4s, v1.4s, v5.s[3] // i[3-0]k[7] - ldp q4, q5, [x3, 0x80] // q4=k[3-0] q5=k[7-4] - fmla v24.4s, v1.4s, v6.s[0] // i[3-0]k[8] - fmla v25.4s, v1.4s, v6.s[1] // i[3-0]k[9] - fmla v26.4s, v1.4s, v6.s[2] // i[3-0]k[a] - fmla v27.4s, v1.4s, v6.s[3] // i[3-0]k[b] - subs x6, x6, #0x1 - prfm pldl1keep, [x2, 0x80] - fmla v28.4s, v1.4s, v7.s[0] // i[3-0]k[c] - fmla v29.4s, v1.4s, v7.s[1] // i[3-0]k[d] - fmla v30.4s, v1.4s, v7.s[2] // i[3-0]k[e] - fmla v31.4s, v1.4s, v7.s[3] // i[3-0]k[f] - - ldp q6, q7, [x3, 0xa0] // q6=k[b-8] q7=k[f-c] - fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] - fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] - fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] - fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] - ldr q1, [x2, 0x30] // q1=i[3-0] - add x2, x2, #0x40 - fmla v20.4s, v0.4s, v5.s[0] // i[3-0]k[4] - fmla v21.4s, v0.4s, v5.s[1] // i[3-0]k[5] - fmla v22.4s, v0.4s, v5.s[2] // i[3-0]k[6] - fmla v23.4s, v0.4s, v5.s[3] // i[3-0]k[7] - ldp q4, q5, [x3, 0xc0] // q4=k[3-0] q5=k[7-4] - fmla v24.4s, v0.4s, v6.s[0] // i[3-0]k[8] - fmla v25.4s, v0.4s, v6.s[1] // i[3-0]k[9] - fmla v26.4s, v0.4s, v6.s[2] // i[3-0]k[a] - fmla v27.4s, v0.4s, v6.s[3] // i[3-0]k[b] - prfm pldl1keep, [x3, 0x140] - fmla v28.4s, v0.4s, v7.s[0] // i[3-0]k[c] - fmla v29.4s, v0.4s, v7.s[1] // i[3-0]k[d] - fmla v30.4s, v0.4s, v7.s[2] // i[3-0]k[e] - fmla v31.4s, v0.4s, v7.s[3] // i[3-0]k[f] - - ldp q6, q7, [x3, 0xe0] // q6=k[b-8] q7=k[f-c] - fmla v16.4s, v1.4s, v4.s[0] // i[3-0]k[0] - fmla v17.4s, v1.4s, v4.s[1] // i[3-0]k[1] - fmla v18.4s, v1.4s, v4.s[2] // i[3-0]k[2] - fmla v19.4s, v1.4s, v4.s[3] // i[3-0]k[3] - prfm pldl1keep, [x3, 0x180] - fmla v20.4s, v1.4s, v5.s[0] // i[3-0]k[4] - fmla v21.4s, v1.4s, v5.s[1] // i[3-0]k[5] - fmla v22.4s, v1.4s, v5.s[2] // i[3-0]k[6] - fmla v23.4s, v1.4s, v5.s[3] // i[3-0]k[7] - prfm pldl1keep, [x3, 0x1c0] - fmla v24.4s, v1.4s, v6.s[0] // i[3-0]k[8] - 
fmla v25.4s, v1.4s, v6.s[1] // i[3-0]k[9] - fmla v26.4s, v1.4s, v6.s[2] // i[3-0]k[a] - fmla v27.4s, v1.4s, v6.s[3] // i[3-0]k[b] - prfm pldl1keep, [x3, 0x200] - add x3, x3, #0x100 - fmla v28.4s, v1.4s, v7.s[0] // i[3-0]k[c] - fmla v29.4s, v1.4s, v7.s[1] // i[3-0]k[d] - fmla v30.4s, v1.4s, v7.s[2] // i[3-0]k[e] - fmla v31.4s, v1.4s, v7.s[3] // i[3-0]k[f] - b.ne loop4 - - and x5, x5, 0x3 - -loop4_end: - cbz x5, finish - -loop1: - ldr q0, [x2], 0x10 // q0=i[3-0] - ldp q4, q5, [x3] // q4=k[3-0] q5=k[7-4] - ldp q6, q7, [x3, 0x20] // q6=k[b-8] q7=k[f-c] - subs x5 ,x5 ,0x1 - fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] - fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] - fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] - fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] - fmla v20.4s, v0.4s, v5.s[0] // i[3-0]k[4] - fmla v21.4s, v0.4s, v5.s[1] // i[3-0]k[5] - fmla v22.4s, v0.4s, v5.s[2] // i[3-0]k[6] - fmla v23.4s, v0.4s, v5.s[3] // i[3-0]k[7] - fmla v24.4s, v0.4s, v6.s[0] // i[3-0]k[8] - fmla v25.4s, v0.4s, v6.s[1] // i[3-0]k[9] - fmla v26.4s, v0.4s, v6.s[2] // i[3-0]k[a] - fmla v27.4s, v0.4s, v6.s[3] // i[3-0]k[b] - fmla v28.4s, v0.4s, v7.s[0] // i[3-0]k[c] - fmla v29.4s, v0.4s, v7.s[1] // i[3-0]k[d] - fmla v30.4s, v0.4s, v7.s[2] // i[3-0]k[e] - fmla v31.4s, v0.4s, v7.s[3] // i[3-0]k[f] - add x3, x3, #0x40 - - b.ne loop1 - - -finish: -// store result -#ifdef CONV_RELU_FUSE - fmov s0,wzr - dup v1.4s,v0.s[0] - fmax v16.4s,v16.4s,v1.4s - fmax v17.4s,v17.4s,v1.4s -#endif - stp q16, q17 ,[x4] - -#ifdef CONV_RELU_FUSE - fmax v18.4s,v18.4s,v1.4s - fmax v19.4s,v19.4s,v1.4s -#endif - stp q18, q19 ,[x4, #0x20] - -#ifdef CONV_RELU_FUSE - fmax v20.4s,v20.4s,v1.4s - fmax v21.4s,v21.4s,v1.4s -#endif - stp q20, q21 ,[x4, #0x40] - -#ifdef CONV_RELU_FUSE - fmax v22.4s,v22.4s,v1.4s - fmax v23.4s,v23.4s,v1.4s -#endif - stp q22, q23 ,[x4, #0x60] - -#ifdef CONV_RELU_FUSE - fmax v24.4s,v24.4s,v1.4s - fmax v25.4s,v25.4s,v1.4s -#endif - stp q24, q25 ,[x4, #0x80] - -#ifdef CONV_RELU_FUSE - fmax v26.4s,v26.4s,v1.4s - 
fmax v27.4s,v27.4s,v1.4s -#endif - stp q26, q27 ,[x4, #0xa0] - -#ifdef CONV_RELU_FUSE - fmax v28.4s,v28.4s,v1.4s - fmax v29.4s,v29.4s,v1.4s -#endif - stp q28, q29 ,[x4, #0xc0] - - -#ifdef CONV_RELU_FUSE - fmax v30.4s,v30.4s,v1.4s - fmax v31.4s,v31.4s,v1.4s -#endif - stp q30, q31 ,[x4, #0xe0] - - ret - -// zero data to fill out a few more cache lines so the prefetcher doesn't -// cause uninitialized memory to be read - - .space 256 - .end - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: xiaowei@openailab.com + */ +// +// 4*16 single precise floating point matric multiplication +// +// -- -- -- -- -- -- -- -- +// | i0 - - - - - - | | k0 k1 .. kf | | t00 t01 .. t0f | | i0k0 i0k1 .. i0kf | +// | | | . . . . | | | | | +// | i1 - - - - - - | | . . . . | | t10 t11 . t1f | | i1k0 i1k1 .. i1kf | +// | | x | . . . . | + | | = | | +// | i2 - - - - - - | | . . . . | | t20 t21 . t2f | | i2k0 i2k1 .. i2kf | +// | | | . . . . | | | | | +// | i3 - - - - - - | | . . . . | | t30 t31 . t3f | | i3k0 i3k1 .. 
i3kf | +// -- -- -- -- -- -- -- -- +// input 4 x p kernel p x 16 biases 4 x 16 output 4 x 16 p = kernel size +// +// +// optimised for Cortex-A72 pipeline 66 cycle per loop (4*16*4 dot product) +// +// input: +// x0 arg0 have biases flag +// x1 arg1 biases start address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3],i[0-3]k[4]..} +// x2 arg2 input start address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...} +// x3 arg3 kernel start address {k[0-15][0],k[0-15][1],k[0-15][2],k[0-15][3],...} +// x4 arg4 output save address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3],i[0-3]k[4]..} +// x5 arg5 kernel size +// +// output: no +// +// register definition +// x0 have biases flag +// x1 biases start address +// x2 input start address +// x3 kernel start address +// x4 output start address +// x5 loop time = kernal size +// x6 ~ x31 not used +// +// v0~v1 4S data of input0 {i3 i2 i1 i0} +// v2-v3 not used +// v4 4S kernal data {k3 | k2 | k1 | k0} +// v5 4S kernal data {k7 | k6 | k5 | k4} +// v6 4S kernal data {kb | ka | k9 | k8} +// v7 4S kernal data {kf | ke | kd | kc} +// v8~v15 not used +// v16 dot product for {i3k0, i2k0, i1k0, i0k0} +// v17 dot product for {i3k1, i2k1, i1k1, i0k1} +// v18 dot product for {i3k2, i2k2, i1k2, i0k2} +// v19 dot product for {i3k3, i2k3, i1k3, i0k3} +// v20 dot product for {i3k4, i2k4, i1k4, i0k4} +// v21 dot product for {i3k5, i2k5, i1k5, i0k5} +// v22 dot product for {i3k6, i2k6, i1k6, i0k6} +// v23 dot product for {i3k7, i2k7, i1k7, i0k7} +// v24 dot product for {i3k8, i2k8, i1k8, i0k8} +// v25 dot product for {i3k9, i2k9, i1k9, i0k9} +// v26 dot product for {i3ka, i2ka, i1ka, i0ka} +// v27 dot product for {i3kb, i2kb, i1kb, i0kb} +// v28 dot product for {i3kc, i2kc, i1kc, i0kc} +// v29 dot product for {i3kd, i2kd, i1kd, i0kd} +// v30 dot product for {i3ke, i2ke, i1ke, i0ke} +// v31 dot product for {i3kf, i2kf, i1kf, i0kf} + +#ifndef INTERLEAVE_FUNC_NAME +#define INTERLEAVE_FUNC_NAME sgemm_4x16_interleave +#endif + + .section 
.text,"ax" + .align 5 + + .type INTERLEAVE_FUNC_NAME STT_FUNC + .global INTERLEAVE_FUNC_NAME + +INTERLEAVE_FUNC_NAME: +// biases_initial + cbz x0, none_biases + ldp q16, q17 ,[x1] + ldp q18, q19 ,[x1, #0x20] + ldp q20, q21 ,[x1, #0x40] + ldp q22, q23 ,[x1, #0x60] + ldp q24, q25 ,[x1, #0x80] + ldp q26, q27 ,[x1, #0xa0] + ldp q28, q29 ,[x1, #0xc0] + ldp q30, q31 ,[x1, #0xe0] + b convolution_start + +none_biases: + movi d16, #0 + movi d17, #0 + movi d18, #0 + movi d19, #0 + movi d20, #0 + movi d21, #0 + movi d22, #0 + movi d23, #0 + movi d24, #0 + movi d25, #0 + movi d26, #0 + movi d27, #0 + movi d28, #0 + movi d29, #0 + movi d30, #0 + movi d31, #0 + +convolution_start: + // compare to 0x4 + cmp x5, 0x4 + blt loop4_end + lsr x6, x5, 0x2 + +// main loop each loop generate dot prodcut for 4x16SFP +loop4: + ldr q0, [x2] // q0=i[3-0] + ldp q4, q5, [x3] // q4=k[3-0] q5=k[7-4] + fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] + fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] + fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] + fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] + ldp q6, q7, [x3, 0x20] // q6=k[b-8] q7=k[f-c] + fmla v20.4s, v0.4s, v5.s[0] // i[3-0]k[4] + fmla v21.4s, v0.4s, v5.s[1] // i[3-0]k[5] + fmla v22.4s, v0.4s, v5.s[2] // i[3-0]k[6] + fmla v23.4s, v0.4s, v5.s[3] // i[3-0]k[7] + ldr q1, [x2, 0x10] // q1=i[3-0] + fmla v24.4s, v0.4s, v6.s[0] // i[3-0]k[8] + fmla v25.4s, v0.4s, v6.s[1] // i[3-0]k[9] + fmla v26.4s, v0.4s, v6.s[2] // i[3-0]k[a] + ldp q4, q5, [x3, 0x40] // q4=k[3-0] q5=k[7-4] + fmla v27.4s, v0.4s, v6.s[3] // i[3-0]k[b] + fmla v28.4s, v0.4s, v7.s[0] // i[3-0]k[c] + fmla v29.4s, v0.4s, v7.s[1] // i[3-0]k[d] + fmla v30.4s, v0.4s, v7.s[2] // i[3-0]k[e] + fmla v31.4s, v0.4s, v7.s[3] // i[3-0]k[f] + + ldp q6, q7, [x3, 0x60] // q6=k[b-8] q7=k[f-c] + fmla v16.4s, v1.4s, v4.s[0] // i[3-0]k[0] + fmla v17.4s, v1.4s, v4.s[1] // i[3-0]k[1] + fmla v18.4s, v1.4s, v4.s[2] // i[3-0]k[2] + fmla v19.4s, v1.4s, v4.s[3] // i[3-0]k[3] + ldr q0, [x2, 0x20] // q1=i[3-0] + fmla v20.4s, v1.4s, 
v5.s[0] // i[3-0]k[4] + fmla v21.4s, v1.4s, v5.s[1] // i[3-0]k[5] + fmla v22.4s, v1.4s, v5.s[2] // i[3-0]k[6] + fmla v23.4s, v1.4s, v5.s[3] // i[3-0]k[7] + ldp q4, q5, [x3, 0x80] // q4=k[3-0] q5=k[7-4] + fmla v24.4s, v1.4s, v6.s[0] // i[3-0]k[8] + fmla v25.4s, v1.4s, v6.s[1] // i[3-0]k[9] + fmla v26.4s, v1.4s, v6.s[2] // i[3-0]k[a] + fmla v27.4s, v1.4s, v6.s[3] // i[3-0]k[b] + subs x6, x6, #0x1 + prfm pldl1keep, [x2, 0x80] + fmla v28.4s, v1.4s, v7.s[0] // i[3-0]k[c] + fmla v29.4s, v1.4s, v7.s[1] // i[3-0]k[d] + fmla v30.4s, v1.4s, v7.s[2] // i[3-0]k[e] + fmla v31.4s, v1.4s, v7.s[3] // i[3-0]k[f] + + ldp q6, q7, [x3, 0xa0] // q6=k[b-8] q7=k[f-c] + fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] + fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] + fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] + fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] + ldr q1, [x2, 0x30] // q1=i[3-0] + add x2, x2, #0x40 + fmla v20.4s, v0.4s, v5.s[0] // i[3-0]k[4] + fmla v21.4s, v0.4s, v5.s[1] // i[3-0]k[5] + fmla v22.4s, v0.4s, v5.s[2] // i[3-0]k[6] + fmla v23.4s, v0.4s, v5.s[3] // i[3-0]k[7] + ldp q4, q5, [x3, 0xc0] // q4=k[3-0] q5=k[7-4] + fmla v24.4s, v0.4s, v6.s[0] // i[3-0]k[8] + fmla v25.4s, v0.4s, v6.s[1] // i[3-0]k[9] + fmla v26.4s, v0.4s, v6.s[2] // i[3-0]k[a] + fmla v27.4s, v0.4s, v6.s[3] // i[3-0]k[b] + prfm pldl1keep, [x3, 0x140] + fmla v28.4s, v0.4s, v7.s[0] // i[3-0]k[c] + fmla v29.4s, v0.4s, v7.s[1] // i[3-0]k[d] + fmla v30.4s, v0.4s, v7.s[2] // i[3-0]k[e] + fmla v31.4s, v0.4s, v7.s[3] // i[3-0]k[f] + + ldp q6, q7, [x3, 0xe0] // q6=k[b-8] q7=k[f-c] + fmla v16.4s, v1.4s, v4.s[0] // i[3-0]k[0] + fmla v17.4s, v1.4s, v4.s[1] // i[3-0]k[1] + fmla v18.4s, v1.4s, v4.s[2] // i[3-0]k[2] + fmla v19.4s, v1.4s, v4.s[3] // i[3-0]k[3] + prfm pldl1keep, [x3, 0x180] + fmla v20.4s, v1.4s, v5.s[0] // i[3-0]k[4] + fmla v21.4s, v1.4s, v5.s[1] // i[3-0]k[5] + fmla v22.4s, v1.4s, v5.s[2] // i[3-0]k[6] + fmla v23.4s, v1.4s, v5.s[3] // i[3-0]k[7] + prfm pldl1keep, [x3, 0x1c0] + fmla v24.4s, v1.4s, v6.s[0] // i[3-0]k[8] + 
fmla v25.4s, v1.4s, v6.s[1] // i[3-0]k[9] + fmla v26.4s, v1.4s, v6.s[2] // i[3-0]k[a] + fmla v27.4s, v1.4s, v6.s[3] // i[3-0]k[b] + prfm pldl1keep, [x3, 0x200] + add x3, x3, #0x100 + fmla v28.4s, v1.4s, v7.s[0] // i[3-0]k[c] + fmla v29.4s, v1.4s, v7.s[1] // i[3-0]k[d] + fmla v30.4s, v1.4s, v7.s[2] // i[3-0]k[e] + fmla v31.4s, v1.4s, v7.s[3] // i[3-0]k[f] + b.ne loop4 + + and x5, x5, 0x3 + +loop4_end: + cbz x5, finish + +loop1: + ldr q0, [x2], 0x10 // q0=i[3-0] + ldp q4, q5, [x3] // q4=k[3-0] q5=k[7-4] + ldp q6, q7, [x3, 0x20] // q6=k[b-8] q7=k[f-c] + subs x5 ,x5 ,0x1 + fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] + fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] + fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] + fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] + fmla v20.4s, v0.4s, v5.s[0] // i[3-0]k[4] + fmla v21.4s, v0.4s, v5.s[1] // i[3-0]k[5] + fmla v22.4s, v0.4s, v5.s[2] // i[3-0]k[6] + fmla v23.4s, v0.4s, v5.s[3] // i[3-0]k[7] + fmla v24.4s, v0.4s, v6.s[0] // i[3-0]k[8] + fmla v25.4s, v0.4s, v6.s[1] // i[3-0]k[9] + fmla v26.4s, v0.4s, v6.s[2] // i[3-0]k[a] + fmla v27.4s, v0.4s, v6.s[3] // i[3-0]k[b] + fmla v28.4s, v0.4s, v7.s[0] // i[3-0]k[c] + fmla v29.4s, v0.4s, v7.s[1] // i[3-0]k[d] + fmla v30.4s, v0.4s, v7.s[2] // i[3-0]k[e] + fmla v31.4s, v0.4s, v7.s[3] // i[3-0]k[f] + add x3, x3, #0x40 + + b.ne loop1 + + +finish: +// store result +#ifdef CONV_RELU_FUSE + fmov s0,wzr + dup v1.4s,v0.s[0] + fmax v16.4s,v16.4s,v1.4s + fmax v17.4s,v17.4s,v1.4s +#endif + stp q16, q17 ,[x4] + +#ifdef CONV_RELU_FUSE + fmax v18.4s,v18.4s,v1.4s + fmax v19.4s,v19.4s,v1.4s +#endif + stp q18, q19 ,[x4, #0x20] + +#ifdef CONV_RELU_FUSE + fmax v20.4s,v20.4s,v1.4s + fmax v21.4s,v21.4s,v1.4s +#endif + stp q20, q21 ,[x4, #0x40] + +#ifdef CONV_RELU_FUSE + fmax v22.4s,v22.4s,v1.4s + fmax v23.4s,v23.4s,v1.4s +#endif + stp q22, q23 ,[x4, #0x60] + +#ifdef CONV_RELU_FUSE + fmax v24.4s,v24.4s,v1.4s + fmax v25.4s,v25.4s,v1.4s +#endif + stp q24, q25 ,[x4, #0x80] + +#ifdef CONV_RELU_FUSE + fmax v26.4s,v26.4s,v1.4s + 
fmax v27.4s,v27.4s,v1.4s +#endif + stp q26, q27 ,[x4, #0xa0] + +#ifdef CONV_RELU_FUSE + fmax v28.4s,v28.4s,v1.4s + fmax v29.4s,v29.4s,v1.4s +#endif + stp q28, q29 ,[x4, #0xc0] + + +#ifdef CONV_RELU_FUSE + fmax v30.4s,v30.4s,v1.4s + fmax v31.4s,v31.4s,v1.4s +#endif + stp q30, q31 ,[x4, #0xe0] + + ret + +// zero data to fill out a few more cache lines so the prefetcher doesn't +// cause uninitialized memory to be read + + .space 256 + .end + diff --git a/executor/operator/arm64/conv/sgemm_4x16_interleave_relu_fused.S b/executor/operator/arm64/conv/sgemm_4x16_interleave_relu_fused.S index 515cc0d14..c3dac1948 100644 --- a/executor/operator/arm64/conv/sgemm_4x16_interleave_relu_fused.S +++ b/executor/operator/arm64/conv/sgemm_4x16_interleave_relu_fused.S @@ -1,28 +1,28 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ - -#define CONV_RELU_FUSE 1 -#define INTERLEAVE_FUNC_NAME sgemm_4x16_interleave_relu_fused - -#include "./sgemm_4x16_interleave.S" +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: haitao@openailab.com + */ + +#define CONV_RELU_FUSE 1 +#define INTERLEAVE_FUNC_NAME sgemm_4x16_interleave_relu_fused + +#include "./sgemm_4x16_interleave.S" diff --git a/executor/operator/arm64/conv/sgemm_4x4_interleave.S b/executor/operator/arm64/conv/sgemm_4x4_interleave.S index 59a820ba8..2197c2604 100644 --- a/executor/operator/arm64/conv/sgemm_4x4_interleave.S +++ b/executor/operator/arm64/conv/sgemm_4x4_interleave.S @@ -1,170 +1,170 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: xiaowei@openailab.com - */ -// -// 4*4 single precise floating point matric multiplication -// -// -- -- -- -- -- -- -- -- -// | i0 - - - - - - | | k0 k1 k2 k3 | | t00 t01 t02 t03 | | i0k0 i0k1 .. i0kf | -// | | | . . . . | | | | | -// | i1 - - - - - - | | . . . . | | t10 t11 t12 t13 | | i1k0 i1k1 .. i1kf | -// | | x | . . . . | + | | = | | -// | i2 - - - - - - | | . . . . | | t20 t21 t22 t23 | | i2k0 i2k1 .. i2kf | -// | | | . . . . | | | | | -// | i3 - - - - - - | | . . . . | | t30 t31 t32 t33 | | i3k0 i3k1 .. i3kf | -// -- -- -- -- -- -- -- -- -// input 4 x p kernel p x 4 biases 4 x 4 output 4 x 4 p = kernel size -// -// -// optimised for Cortex-A72 pipeline 18 cycle per loop (4*4*4 dot product) -// -// input: -// x0 arg0 have biases flag -// x1 arg1 biases start address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3]} -// x2 arg2 input start address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...} -// x3 arg3 kernel start address {k[0-3][0],k[0-3][1],k[0-3][2],k[0-3][3],...} -// x4 arg4 output save address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3]} -// x5 arg5 kernel size -// -// output: no -// -// register definition -// x0 have biases flag -// x1 biases start address -// x2 input start address -// x3 kernel start address -// x4 output start address -// x5 loop time = kernal size -// x6 ~ x31 not used -// -// v0-3 4S data of input0 {i3 i2 i1 i0} -// v4-7 4S kernal data {k3 k2 k1 k0} -// v8~v15 not used -// v16 dot product for {i3k0, i2k0, i1k0, i0k0} -// v17 dot product for {i3k1, i2k1, i1k1, i0k1} -// v18 dot product for {i3k2, i2k2, i1k2, i0k2} -// v19 dot product for {i3k3, i2k3, i1k3, i0k3} -// v20~V31 not used - .section .text,"ax" - .align 5 - -#ifndef INTERLEAVE_FUNC_NAME -#define INTERLEAVE_FUNC_NAME sgemm_4x4_interleave -#endif - .type INTERLEAVE_FUNC_NAME STT_FUNC - .global 
INTERLEAVE_FUNC_NAME - -INTERLEAVE_FUNC_NAME: -// initial - cbz x0, non_biases - - ldp q16, q17, [x1] - ldp q18, q19, [x1,0x20] - b convoluation_start - -non_biases: - movi d16, #0x0 - movi d17, #0x0 - movi d18, #0x0 - movi d19, #0x0 - -convoluation_start: - // compare to 0x4 - cmp x5, 0x4 - blt loop4_end - lsr x6, x5, 0x2 - -// main loop each loop generate dot prodcut for 4x4SFP -loop4: - subs x6 ,x6 ,0x1 - - ldr q0, [x2] // q0=i[3-0] - ldp q4, q5, [x3] // q4=k[3-0] - fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] - fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] - ldr q1, [x2, 0x10] // q1=i[3-0] - fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] - fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] - - ldp q2, q3, [x2, 0x20] // q2=i[3-0] q3=i[3-0] - fmla v16.4s, v1.4s, v5.s[0] // i[3-0]k[0] - fmla v17.4s, v1.4s, v5.s[1] // i[3-0]k[1] - ldp q6, q7, [x3, 0x20] // q6=k[3-0] q7=q7=k[3-0] - fmla v18.4s, v1.4s, v5.s[2] // i[3-0]k[2] - fmla v19.4s, v1.4s, v5.s[3] // i[3-0]k[3] - - fmla v16.4s, v2.4s, v6.s[0] // i[3-0]k[0] - fmla v17.4s, v2.4s, v6.s[1] // i[3-0]k[1] - prfm pldl1keep, [x2, 0x140] - add x2, x2, #0x40 - fmla v18.4s, v2.4s, v6.s[2] // i[3-0]k[2] - fmla v19.4s, v2.4s, v6.s[3] // i[3-0]k[3] - - prfm pldl1keep, [x3, 0x140] - add x3, x3, #0x40 - fmla v16.4s, v3.4s, v7.s[0] // i[3-0]k[0] - fmla v17.4s, v3.4s, v7.s[1] // i[3-0]k[1] - fmla v18.4s, v3.4s, v7.s[2] // i[3-0]k[2] - fmla v19.4s, v3.4s, v7.s[3] // i[3-0]k[3] - - b.ne loop4 - - and x5, x5, 0x3 - -loop4_end: - cbz x5, finish - -loop1: - subs x5 ,x5 ,0x1 - ldr q0, [x2], 0x10 // q0=i[3-0] - ldr q4, [x3], 0x10 // q4=k[3-0] - fmla v16.4s, v0.4s, v4.s[0] // i[0]k[3-0] - fmla v17.4s, v0.4s, v4.s[1] // i[1]k[3-0] - fmla v18.4s, v0.4s, v4.s[2] // i[2]k[3-0] - fmla v19.4s, v0.4s, v4.s[3] // i[3]k[3-0] - - b.ne loop1 -finish: -// store result -#ifdef CONV_RELU_FUSE - fmov s0,wzr - dup v1.4s,v0.s[0] - fmax v16.4s,v16.4s,v1.4s - fmax v17.4s,v17.4s,v1.4s -#endif - stp q16, q17, [x4] - -#ifdef CONV_RELU_FUSE - fmax v18.4s,v18.4s,v1.4s - fmax 
v19.4s,v19.4s,v1.4s -#endif - stp q18, q19, [x4,0x20] - - ret - -// zero data to fill out a few more cache lines so the prefetcher doesn't -// cause uninitialized memory to be read - - .space 256 - .end - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: xiaowei@openailab.com + */ +// +// 4*4 single precise floating point matric multiplication +// +// -- -- -- -- -- -- -- -- +// | i0 - - - - - - | | k0 k1 k2 k3 | | t00 t01 t02 t03 | | i0k0 i0k1 .. i0kf | +// | | | . . . . | | | | | +// | i1 - - - - - - | | . . . . | | t10 t11 t12 t13 | | i1k0 i1k1 .. i1kf | +// | | x | . . . . | + | | = | | +// | i2 - - - - - - | | . . . . | | t20 t21 t22 t23 | | i2k0 i2k1 .. i2kf | +// | | | . . . . | | | | | +// | i3 - - - - - - | | . . . . | | t30 t31 t32 t33 | | i3k0 i3k1 .. 
i3kf | +// -- -- -- -- -- -- -- -- +// input 4 x p kernel p x 4 biases 4 x 4 output 4 x 4 p = kernel size +// +// +// optimised for Cortex-A72 pipeline 18 cycle per loop (4*4*4 dot product) +// +// input: +// x0 arg0 have biases flag +// x1 arg1 biases start address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3]} +// x2 arg2 input start address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...} +// x3 arg3 kernel start address {k[0-3][0],k[0-3][1],k[0-3][2],k[0-3][3],...} +// x4 arg4 output save address {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3]} +// x5 arg5 kernel size +// +// output: no +// +// register definition +// x0 have biases flag +// x1 biases start address +// x2 input start address +// x3 kernel start address +// x4 output start address +// x5 loop time = kernal size +// x6 ~ x31 not used +// +// v0-3 4S data of input0 {i3 i2 i1 i0} +// v4-7 4S kernal data {k3 k2 k1 k0} +// v8~v15 not used +// v16 dot product for {i3k0, i2k0, i1k0, i0k0} +// v17 dot product for {i3k1, i2k1, i1k1, i0k1} +// v18 dot product for {i3k2, i2k2, i1k2, i0k2} +// v19 dot product for {i3k3, i2k3, i1k3, i0k3} +// v20~V31 not used + .section .text,"ax" + .align 5 + +#ifndef INTERLEAVE_FUNC_NAME +#define INTERLEAVE_FUNC_NAME sgemm_4x4_interleave +#endif + .type INTERLEAVE_FUNC_NAME STT_FUNC + .global INTERLEAVE_FUNC_NAME + +INTERLEAVE_FUNC_NAME: +// initial + cbz x0, non_biases + + ldp q16, q17, [x1] + ldp q18, q19, [x1,0x20] + b convoluation_start + +non_biases: + movi d16, #0x0 + movi d17, #0x0 + movi d18, #0x0 + movi d19, #0x0 + +convoluation_start: + // compare to 0x4 + cmp x5, 0x4 + blt loop4_end + lsr x6, x5, 0x2 + +// main loop each loop generate dot prodcut for 4x4SFP +loop4: + subs x6 ,x6 ,0x1 + + ldr q0, [x2] // q0=i[3-0] + ldp q4, q5, [x3] // q4=k[3-0] + fmla v16.4s, v0.4s, v4.s[0] // i[3-0]k[0] + fmla v17.4s, v0.4s, v4.s[1] // i[3-0]k[1] + ldr q1, [x2, 0x10] // q1=i[3-0] + fmla v18.4s, v0.4s, v4.s[2] // i[3-0]k[2] + fmla v19.4s, v0.4s, v4.s[3] // i[3-0]k[3] + + ldp 
q2, q3, [x2, 0x20] // q2=i[3-0] q3=i[3-0] + fmla v16.4s, v1.4s, v5.s[0] // i[3-0]k[0] + fmla v17.4s, v1.4s, v5.s[1] // i[3-0]k[1] + ldp q6, q7, [x3, 0x20] // q6=k[3-0] q7=q7=k[3-0] + fmla v18.4s, v1.4s, v5.s[2] // i[3-0]k[2] + fmla v19.4s, v1.4s, v5.s[3] // i[3-0]k[3] + + fmla v16.4s, v2.4s, v6.s[0] // i[3-0]k[0] + fmla v17.4s, v2.4s, v6.s[1] // i[3-0]k[1] + prfm pldl1keep, [x2, 0x140] + add x2, x2, #0x40 + fmla v18.4s, v2.4s, v6.s[2] // i[3-0]k[2] + fmla v19.4s, v2.4s, v6.s[3] // i[3-0]k[3] + + prfm pldl1keep, [x3, 0x140] + add x3, x3, #0x40 + fmla v16.4s, v3.4s, v7.s[0] // i[3-0]k[0] + fmla v17.4s, v3.4s, v7.s[1] // i[3-0]k[1] + fmla v18.4s, v3.4s, v7.s[2] // i[3-0]k[2] + fmla v19.4s, v3.4s, v7.s[3] // i[3-0]k[3] + + b.ne loop4 + + and x5, x5, 0x3 + +loop4_end: + cbz x5, finish + +loop1: + subs x5 ,x5 ,0x1 + ldr q0, [x2], 0x10 // q0=i[3-0] + ldr q4, [x3], 0x10 // q4=k[3-0] + fmla v16.4s, v0.4s, v4.s[0] // i[0]k[3-0] + fmla v17.4s, v0.4s, v4.s[1] // i[1]k[3-0] + fmla v18.4s, v0.4s, v4.s[2] // i[2]k[3-0] + fmla v19.4s, v0.4s, v4.s[3] // i[3]k[3-0] + + b.ne loop1 +finish: +// store result +#ifdef CONV_RELU_FUSE + fmov s0,wzr + dup v1.4s,v0.s[0] + fmax v16.4s,v16.4s,v1.4s + fmax v17.4s,v17.4s,v1.4s +#endif + stp q16, q17, [x4] + +#ifdef CONV_RELU_FUSE + fmax v18.4s,v18.4s,v1.4s + fmax v19.4s,v19.4s,v1.4s +#endif + stp q18, q19, [x4,0x20] + + ret + +// zero data to fill out a few more cache lines so the prefetcher doesn't +// cause uninitialized memory to be read + + .space 256 + .end + diff --git a/executor/operator/arm64/conv/sgemm_4x4_interleave_relu_fused.S b/executor/operator/arm64/conv/sgemm_4x4_interleave_relu_fused.S index f956b72a5..65162223c 100644 --- a/executor/operator/arm64/conv/sgemm_4x4_interleave_relu_fused.S +++ b/executor/operator/arm64/conv/sgemm_4x4_interleave_relu_fused.S @@ -1,27 +1,27 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ -#define CONV_RELU_FUSE 1 -#define INTERLEAVE_FUNC_NAME sgemm_4x4_interleave_relu_fused - -#include "./sgemm_4x4_interleave.S" +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: haitao@openailab.com + */ +#define CONV_RELU_FUSE 1 +#define INTERLEAVE_FUNC_NAME sgemm_4x4_interleave_relu_fused + +#include "./sgemm_4x4_interleave.S" diff --git a/executor/operator/arm64/fc/Makefile b/executor/operator/arm64/fc/Makefile index 243a02695..d4482a4cd 100644 --- a/executor/operator/arm64/fc/Makefile +++ b/executor/operator/arm64/fc/Makefile @@ -1,7 +1,7 @@ -obj-y+=fully_connected_fast.o -obj-y+=sgemv_1x8_a72.o -obj-y+=sgemv_1x2_a72.o -obj-y+=sgemv_1x8_a53.o -obj-y+=sgemv_1x2_a53.o - -fully_connected_fast_CXXFLAGS+=-I../include +obj-y+=fully_connected_fast.o +obj-y+=sgemv_1x8_a72.o +obj-y+=sgemv_1x2_a72.o +obj-y+=sgemv_1x8_a53.o +obj-y+=sgemv_1x2_a53.o + +fully_connected_fast_CXXFLAGS+=-I../include diff --git a/executor/operator/arm64/fc/fully_connected_fast.cpp b/executor/operator/arm64/fc/fully_connected_fast.cpp index 6c1b71849..234a26539 100644 --- a/executor/operator/arm64/fc/fully_connected_fast.cpp +++ b/executor/operator/arm64/fc/fully_connected_fast.cpp @@ -176,8 +176,8 @@ struct FCOps : public MTNodeOps Tensor* tensor; tensor = node->GetInputTensor(1); - int M = tensor->GetShape().GetH(); - int K = tensor->GetShape().GetW(); + int M = tensor->GetShape().Shape(0); + int K = tensor->GetShape().Shape(1); float* weight = ( float* )get_tensor_mem(tensor); @@ -188,8 +188,6 @@ struct FCOps : public MTNodeOps if(exec_attr->low_mem_mode) { - printf("Free fc weight: %s %d\n", tensor->GetName().c_str(), tensor->GetTotalSize()); - tensor->FreeMem(); } @@ -240,8 +238,8 @@ struct FCOps : public MTNodeOps /* weight */ tensor = node->GetInputTensor(1); - int M = tensor->GetShape().GetH(); - int K = tensor->GetShape().GetW(); + int M = tensor->GetShape().Shape(0); + int K = tensor->GetShape().Shape(1); float* weight_interleaved = any_cast(node->GetAttr("weight_interleaved")); /* output */ @@ -338,6 +336,10 @@ struct FCOps : public MTNodeOps NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) { + 
const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + FCOps* ops = new FCOps(); int master_cpu = cpu_info->GetMasterCPU(); diff --git a/executor/operator/arm64/fc/sgemv_1x2_a53.S b/executor/operator/arm64/fc/sgemv_1x2_a53.S index a131d9f35..4cdec48bf 100644 --- a/executor/operator/arm64/fc/sgemv_1x2_a53.S +++ b/executor/operator/arm64/fc/sgemv_1x2_a53.S @@ -1,126 +1,126 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: xiaowei@openailab.com - */ - -// -// 1*2 single precise floating point matric multiplication -// -// -- -- -// | k0 k1 | -// | . . | -// -- -- | . . | -- -- -- -- -// | i0 - - - - - - | x | . . | + | b0 b1 | = | i0k0 i0k1 | -// -- -- | . . | -- -- -- -- -// | . . | -// | . . 
| -// -- -- -// input 1 x p kernel p x 2 biases x 2 output 1 x 2 p = kernel size -// -// -// optimised for Cortex-A53 pipeline 15 cycle per loop (1*2*4 dot product) -// the bottleneck is memory bandwidth -// -// input: -// x0 arg0 biases start address {b0, b1 } -// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} -// x2 arg2 kernel data start address {k00, k10, k01, k11, k02, k12, ...} -// x3 arg3 kernel size -// x4 arg4 output data save address {ik0, ik1} -// -// output: no -// -// v0 4S data of input {i3 i2 i1 i0 } -// v1~v7 not used -// v16 2S kernal data0 {k10 | k00} -// v17 not used -// v18 2S kernal data1 {k11 | k01} -// v19 not used -// v20 2S kernal data2 {k12 | k02} -// v21 not used -// v22 2S kernal data3 {k13 | k03} -// v23 not used -// v24-29 not used -// v30 dot product for {ik1, ik0} -// v31 dot product for {ik1, ik0} - - .section .text,"ax" - .align 5 - - .type sgemv_1x2_a53 STT_FUNC - .global sgemv_1x2_a53 -sgemv_1x2_a53: - // initial - movi d30, 0 - prfm pldl1keep, [x1, 0x40] - prfm pldl1keep, [x2, 0x80] - cmp x3, 0x4 - cbz x0, start_convolution - ldr d30, [x0] - -start_convolution: - and x10,x3, 0x3 - b.lt loop4_end - movi d31, 0 - lsr x9, x3, 0x2 - - -// main loop each loop generate dot prodcut for 1x8x2SFP -loop4: - ldr q0, [x1] // q0 = i[3-0] - ldp d16, d18, [x2] // d16 = k[1-0][0] d18 = k[1-0][1] - ldp d20, d22, [x2, 0x10] // d20 = k[1-0][2] d22 = k[1-0][3] - prfm pldl1keep, [x1, 0xa0] - add x1, x1, 0x10 - - fmla v30.2s, v16.2s, v0.s[0] // ik[1-0][0] - subs x9, x9, 0x1 - fmla v31.2s, v18.2s, v0.s[1] // ik[1-0][1] - prfm pldl1keep, [x2, 0x140] - fmla v30.2s, v20.2s, v0.s[2] // ik[1-0][2] - add x2, x2, 0x20 - fmla v31.2s, v22.2s, v0.s[3] // ik[1-0][3] - - b.ne loop4 - fadd v30.2s, v30.2s, v31.2s - -loop4_end: - cbz x10, save_result - -loop1: - ldr s0, [x1], 0x4 - ldr d16,[x2], 0x8 - subs x10,x10, 0x1 - - fmla v30.2s, v16.2s, v0.s[0] - - b.ne loop1 - -save_result: - str d30, [x4] - - ret - - - .space 256 - .end 
- +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: xiaowei@openailab.com + */ + +// +// 1*2 single precise floating point matric multiplication +// +// -- -- +// | k0 k1 | +// | . . | +// -- -- | . . | -- -- -- -- +// | i0 - - - - - - | x | . . | + | b0 b1 | = | i0k0 i0k1 | +// -- -- | . . | -- -- -- -- +// | . . | +// | . . 
| +// -- -- +// input 1 x p kernel p x 2 biases x 2 output 1 x 2 p = kernel size +// +// +// optimised for Cortex-A53 pipeline 15 cycle per loop (1*2*4 dot product) +// the bottleneck is memory bandwidth +// +// input: +// x0 arg0 biases start address {b0, b1 } +// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} +// x2 arg2 kernel data start address {k00, k10, k01, k11, k02, k12, ...} +// x3 arg3 kernel size +// x4 arg4 output data save address {ik0, ik1} +// +// output: no +// +// v0 4S data of input {i3 i2 i1 i0 } +// v1~v7 not used +// v16 2S kernal data0 {k10 | k00} +// v17 not used +// v18 2S kernal data1 {k11 | k01} +// v19 not used +// v20 2S kernal data2 {k12 | k02} +// v21 not used +// v22 2S kernal data3 {k13 | k03} +// v23 not used +// v24-29 not used +// v30 dot product for {ik1, ik0} +// v31 dot product for {ik1, ik0} + + .section .text,"ax" + .align 5 + + .type sgemv_1x2_a53 STT_FUNC + .global sgemv_1x2_a53 +sgemv_1x2_a53: + // initial + movi d30, 0 + prfm pldl1keep, [x1, 0x40] + prfm pldl1keep, [x2, 0x80] + cmp x3, 0x4 + cbz x0, start_convolution + ldr d30, [x0] + +start_convolution: + and x10,x3, 0x3 + b.lt loop4_end + movi d31, 0 + lsr x9, x3, 0x2 + + +// main loop each loop generate dot prodcut for 1x8x2SFP +loop4: + ldr q0, [x1] // q0 = i[3-0] + ldp d16, d18, [x2] // d16 = k[1-0][0] d18 = k[1-0][1] + ldp d20, d22, [x2, 0x10] // d20 = k[1-0][2] d22 = k[1-0][3] + prfm pldl1keep, [x1, 0xa0] + add x1, x1, 0x10 + + fmla v30.2s, v16.2s, v0.s[0] // ik[1-0][0] + subs x9, x9, 0x1 + fmla v31.2s, v18.2s, v0.s[1] // ik[1-0][1] + prfm pldl1keep, [x2, 0x140] + fmla v30.2s, v20.2s, v0.s[2] // ik[1-0][2] + add x2, x2, 0x20 + fmla v31.2s, v22.2s, v0.s[3] // ik[1-0][3] + + b.ne loop4 + fadd v30.2s, v30.2s, v31.2s + +loop4_end: + cbz x10, save_result + +loop1: + ldr s0, [x1], 0x4 + ldr d16,[x2], 0x8 + subs x10,x10, 0x1 + + fmla v30.2s, v16.2s, v0.s[0] + + b.ne loop1 + +save_result: + str d30, [x4] + + ret + + + .space 256 + .end 
+ diff --git a/executor/operator/arm64/fc/sgemv_1x2_a72.S b/executor/operator/arm64/fc/sgemv_1x2_a72.S index 85a7312f3..ea1ad0faa 100644 --- a/executor/operator/arm64/fc/sgemv_1x2_a72.S +++ b/executor/operator/arm64/fc/sgemv_1x2_a72.S @@ -1,126 +1,126 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: xiaowei@openailab.com - */ - -// -// 1*2 single precise floating point matric multiplication -// -// -- -- -// | k0 k1 | -// | . . | -// -- -- | . . | -- -- -- -- -// | i0 - - - - - - | x | . . | + | b0 b1 | = | i0k0 i0k1 | -// -- -- | . . | -- -- -- -- -// | . . | -// | . . 
| -// -- -- -// input 1 x p kernel p x 2 biases x 2 output 1 x 2 p = kernel size -// -// -// optimised for Cortex-A72 pipeline 9 cycle per loop (1*2*4 dot product) -// the bottleneck is memory bandwidth -// -// input: -// x0 arg0 biases start address {b0, b1 } -// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} -// x2 arg2 kernel data start address {k00, k10, k01, k11, k02, k12, ...} -// x3 arg3 kernel size -// x4 arg4 output data save address {ik0, ik1} -// -// output: no -// -// v0 4S data of input {i3 i2 i1 i0 } -// v1~v7 not used -// v16 2S kernal data0 {k10 | k00} -// v17 not used -// v18 2S kernal data1 {k11 | k01} -// v19 not used -// v20 2S kernal data2 {k12 | k02} -// v21 not used -// v22 2S kernal data3 {k13 | k03} -// v23 not used -// v24-29 not used -// v30 dot product for {ik1, ik0} -// v31 dot product for {ik1, ik0} - - .section .text,"ax" - .align 5 - - .type sgemv_1x2_a72 STT_FUNC - .global sgemv_1x2_a72 -sgemv_1x2_a72: -// initial - movi d30, 0 - prfm pldl1keep, [x1, 0x80] - cmp x3, 0x4 - prfm pldl1keep, [x2, 0x100] - prfm pldl1keep, [x2, 0x140] - cbz x0, start_convolution - ldr d30, [x0] - -start_convolution: - and x10,x3, 0x3 - b.lt loop4_end - movi d31, 0 - lsr x9, x3, 0x2 - - -// main loop each loop generate dot prodcut for 1x2x4SFP -loop4: - ldr q0, [x1] // q0 = i[3-0] - ldp d16, d18, [x2] // d16 = k[1-0][0] d18 = k[1-0][1] - ldp d20, d22, [x2, 0x10] // d20 = k[1-0][2] d22 = k[1-0][3] - prfm pldl1keep, [x1, 0x100] - add x1, x1, 0x10 - - fmla v30.2s, v16.2s, v0.s[0] // ik[1-0][0] - subs x9, x9, 0x1 - fmla v31.2s, v18.2s, v0.s[1] // ik[1-0][1] - prfm pldl1keep, [x2, 0x200] - add x2, x2, 0x20 - fmla v30.2s, v20.2s, v0.s[2] // ik[1-0][2] - fmla v31.2s, v22.2s, v0.s[3] // ik[1-0][3] - - b.ne loop4 - fadd v30.2s, v30.2s, v31.2s - -loop4_end: - cbz x10, save_result - -loop1: - ldr s0, [x1], 0x4 - ldr d16,[x2], 0x8 - subs x10,x10, 0x1 - - fmla v30.2s, v16.2s, v0.s[0] - - b.ne loop1 - -save_result: - str d30, [x4] - 
- ret - - .space 256 - .end - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: xiaowei@openailab.com + */ + +// +// 1*2 single precise floating point matric multiplication +// +// -- -- +// | k0 k1 | +// | . . | +// -- -- | . . | -- -- -- -- +// | i0 - - - - - - | x | . . | + | b0 b1 | = | i0k0 i0k1 | +// -- -- | . . | -- -- -- -- +// | . . | +// | . . 
| +// -- -- +// input 1 x p kernel p x 2 biases x 2 output 1 x 2 p = kernel size +// +// +// optimised for Cortex-A72 pipeline 9 cycle per loop (1*2*4 dot product) +// the bottleneck is memory bandwidth +// +// input: +// x0 arg0 biases start address {b0, b1 } +// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} +// x2 arg2 kernel data start address {k00, k10, k01, k11, k02, k12, ...} +// x3 arg3 kernel size +// x4 arg4 output data save address {ik0, ik1} +// +// output: no +// +// v0 4S data of input {i3 i2 i1 i0 } +// v1~v7 not used +// v16 2S kernal data0 {k10 | k00} +// v17 not used +// v18 2S kernal data1 {k11 | k01} +// v19 not used +// v20 2S kernal data2 {k12 | k02} +// v21 not used +// v22 2S kernal data3 {k13 | k03} +// v23 not used +// v24-29 not used +// v30 dot product for {ik1, ik0} +// v31 dot product for {ik1, ik0} + + .section .text,"ax" + .align 5 + + .type sgemv_1x2_a72 STT_FUNC + .global sgemv_1x2_a72 +sgemv_1x2_a72: +// initial + movi d30, 0 + prfm pldl1keep, [x1, 0x80] + cmp x3, 0x4 + prfm pldl1keep, [x2, 0x100] + prfm pldl1keep, [x2, 0x140] + cbz x0, start_convolution + ldr d30, [x0] + +start_convolution: + and x10,x3, 0x3 + b.lt loop4_end + movi d31, 0 + lsr x9, x3, 0x2 + + +// main loop each loop generate dot prodcut for 1x2x4SFP +loop4: + ldr q0, [x1] // q0 = i[3-0] + ldp d16, d18, [x2] // d16 = k[1-0][0] d18 = k[1-0][1] + ldp d20, d22, [x2, 0x10] // d20 = k[1-0][2] d22 = k[1-0][3] + prfm pldl1keep, [x1, 0x100] + add x1, x1, 0x10 + + fmla v30.2s, v16.2s, v0.s[0] // ik[1-0][0] + subs x9, x9, 0x1 + fmla v31.2s, v18.2s, v0.s[1] // ik[1-0][1] + prfm pldl1keep, [x2, 0x200] + add x2, x2, 0x20 + fmla v30.2s, v20.2s, v0.s[2] // ik[1-0][2] + fmla v31.2s, v22.2s, v0.s[3] // ik[1-0][3] + + b.ne loop4 + fadd v30.2s, v30.2s, v31.2s + +loop4_end: + cbz x10, save_result + +loop1: + ldr s0, [x1], 0x4 + ldr d16,[x2], 0x8 + subs x10,x10, 0x1 + + fmla v30.2s, v16.2s, v0.s[0] + + b.ne loop1 + +save_result: + str d30, [x4] + 
+ ret + + .space 256 + .end + diff --git a/executor/operator/arm64/fc/sgemv_1x8_a53.S b/executor/operator/arm64/fc/sgemv_1x8_a53.S index b0382a813..e7b5698e2 100644 --- a/executor/operator/arm64/fc/sgemv_1x8_a53.S +++ b/executor/operator/arm64/fc/sgemv_1x8_a53.S @@ -1,133 +1,133 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: xiaowei@openailab.com - */ - -// -// 1*8 single precise floating point matric multiplication -// -// -- -- -// | k0 k1 .. k7 | -// | . . . . | -// -- -- | . . . . | -- -- -- -- -// | i0 - - - - - - | x | . . . . | + | b0 b1 .. b7 | = | i0k0 i0k1 .. i0k7 | -// -- -- | . . . . | -- -- -- -- -// | . . . . | -// | . . . . 
| -// -- -- -// input 1 x p kernel p x 8 biases 1 x 8 output 1 x 8 p = kernel size -// -// -// optimised for Cortex-A53 pipeline 43 cycle per loop (1*8*4 dot product) -// the bottleneck is memory bandwidth -// -// input: -// x0 arg0 biases start address {b0, b1, b2, b3, b4, b5, b6, b7}} -// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} -// x2 arg2 kernel data start address {k00, k10, k20, k30, k40, k50, k60, k70, k80, k01, k11, k21, k31, ...} -// x3 arg3 kernel size -// x4 arg4 output data save address {ik0, ik1, ik2, ik3, ik4, ik5, ik6, ik7, ik8} -// -// output: no -// -// v0 4S data of input {i3 i2 i1 i0 } -// v1~v7 not used -// v16 4S kernal data0 {k30 | k20 | k10 | k00} -// v17 4S kernal data4 {k70 | k60 | k50 | k40} -// v18 4S kernal data1 {k31 | k21 | k11 | k01} -// v19 4S kernal data5 {k71 | k61 | k51 | k41} -// v20 4S kernal data2 {k32 | k22 | k12 | k02} -// v21 4S kernal data6 {k72 | k62 | k52 | k42} -// v22 4S kernal data3 {k33 | k23 | k13 | k03} -// v23 4S kernal data7 {k73 | k63 | k53 | k43} -// v24-v29 not used -// v30 dot product for {ik3, ik2, ik1, ik0} -// v31 dot product for {ik7, ik6, ik5, ik4} - - .section .text,"ax" - .align 5 - - .type sgemv_1x8_a53 STT_FUNC - .global sgemv_1x8_a53 -sgemv_1x8_a53: - // initial - movi d30, 0 - cmp x3, 0x4 - movi d31, 0 - prfm pldl1keep, [x1, 0x40] - cbz x0, start_convolution - ldp q30, q31, [x0] - -start_convolution: - b.lt loop1 - lsr x6, x3, 0x2 - movi d28, 0 - movi d29, 0 - -// main loop each loop generate dot prodcut for 1x8x4SFP -loop4: - ldr q0, [x1] // q0 = i[3-0] - ldp q16, q17, [x2] // q16 = k[3-0][0] q17 = k[7-4][0] - ldp q18, q19, [x2, 0x20] // q18 = k[3-0][1] q19 = k[7-4][1] - ldp q20, q21, [x2, 0x40] // q20 = k[3-0][2] q21 = k[7-4][2] - ldp q22, q23, [x2, 0x60] // q22 = k[3-0][3] q23 = k[7-4][3] - subs x6, x6, 0x1 - - fmla v28.4s, v16.4s, v0.s[0] // ik[3-0][0] - fmla v29.4s, v17.4s, v0.s[0] // ik[7-4][0] - prfm pldl1keep, [x1, 0x80] - fmla v30.4s, v18.4s, 
v0.s[1] // ik[3-0][1] - prfm pldl1keep, [x2, 0x340] - fmla v31.4s, v19.4s, v0.s[1] // ik[7-4][1] - add x1, x1, 0x10 - fmla v28.4s, v20.4s, v0.s[2] // ik[3-0][2] - prfm pldl1keep, [x2, 0x380] - fmla v29.4s, v21.4s, v0.s[2] // ik[7-4][2] - fmla v30.4s, v22.4s, v0.s[3] // ik[3-0][3] - add x2, x2, 0x80 - fmla v31.4s, v23.4s, v0.s[3] // ik[7-4][3] - - b.ne loop4 - - and x3, x3, 0x3 - fadd v30.4s, v30.4s, v28.4s - fadd v31.4s, v31.4s, v29.4s - cbz x3, save_result - -loop1: - ldr s0, [x1], 0x4 - ldp q16, q17, [x2], 0x20 - subs x3, x3, 0x1 - - fmla v30.4s, v16.4s, v0.s[0] - fmla v31.4s, v17.4s, v0.s[0] - - b.ne loop1 - -save_result: - stp q30, q31, [x4] - - ret - - .space 256 - .end - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: xiaowei@openailab.com + */ + +// +// 1*8 single precise floating point matric multiplication +// +// -- -- +// | k0 k1 .. k7 | +// | . . . . | +// -- -- | . . . . | -- -- -- -- +// | i0 - - - - - - | x | . . . . | + | b0 b1 .. b7 | = | i0k0 i0k1 .. i0k7 | +// -- -- | . . . . | -- -- -- -- +// | . . . . | +// | . . . . 
| +// -- -- +// input 1 x p kernel p x 8 biases 1 x 8 output 1 x 8 p = kernel size +// +// +// optimised for Cortex-A53 pipeline 43 cycle per loop (1*8*4 dot product) +// the bottleneck is memory bandwidth +// +// input: +// x0 arg0 biases start address {b0, b1, b2, b3, b4, b5, b6, b7}} +// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} +// x2 arg2 kernel data start address {k00, k10, k20, k30, k40, k50, k60, k70, k80, k01, k11, k21, k31, ...} +// x3 arg3 kernel size +// x4 arg4 output data save address {ik0, ik1, ik2, ik3, ik4, ik5, ik6, ik7, ik8} +// +// output: no +// +// v0 4S data of input {i3 i2 i1 i0 } +// v1~v7 not used +// v16 4S kernal data0 {k30 | k20 | k10 | k00} +// v17 4S kernal data4 {k70 | k60 | k50 | k40} +// v18 4S kernal data1 {k31 | k21 | k11 | k01} +// v19 4S kernal data5 {k71 | k61 | k51 | k41} +// v20 4S kernal data2 {k32 | k22 | k12 | k02} +// v21 4S kernal data6 {k72 | k62 | k52 | k42} +// v22 4S kernal data3 {k33 | k23 | k13 | k03} +// v23 4S kernal data7 {k73 | k63 | k53 | k43} +// v24-v29 not used +// v30 dot product for {ik3, ik2, ik1, ik0} +// v31 dot product for {ik7, ik6, ik5, ik4} + + .section .text,"ax" + .align 5 + + .type sgemv_1x8_a53 STT_FUNC + .global sgemv_1x8_a53 +sgemv_1x8_a53: + // initial + movi d30, 0 + cmp x3, 0x4 + movi d31, 0 + prfm pldl1keep, [x1, 0x40] + cbz x0, start_convolution + ldp q30, q31, [x0] + +start_convolution: + b.lt loop1 + lsr x6, x3, 0x2 + movi d28, 0 + movi d29, 0 + +// main loop each loop generate dot prodcut for 1x8x4SFP +loop4: + ldr q0, [x1] // q0 = i[3-0] + ldp q16, q17, [x2] // q16 = k[3-0][0] q17 = k[7-4][0] + ldp q18, q19, [x2, 0x20] // q18 = k[3-0][1] q19 = k[7-4][1] + ldp q20, q21, [x2, 0x40] // q20 = k[3-0][2] q21 = k[7-4][2] + ldp q22, q23, [x2, 0x60] // q22 = k[3-0][3] q23 = k[7-4][3] + subs x6, x6, 0x1 + + fmla v28.4s, v16.4s, v0.s[0] // ik[3-0][0] + fmla v29.4s, v17.4s, v0.s[0] // ik[7-4][0] + prfm pldl1keep, [x1, 0x80] + fmla v30.4s, v18.4s, 
v0.s[1] // ik[3-0][1] + prfm pldl1keep, [x2, 0x340] + fmla v31.4s, v19.4s, v0.s[1] // ik[7-4][1] + add x1, x1, 0x10 + fmla v28.4s, v20.4s, v0.s[2] // ik[3-0][2] + prfm pldl1keep, [x2, 0x380] + fmla v29.4s, v21.4s, v0.s[2] // ik[7-4][2] + fmla v30.4s, v22.4s, v0.s[3] // ik[3-0][3] + add x2, x2, 0x80 + fmla v31.4s, v23.4s, v0.s[3] // ik[7-4][3] + + b.ne loop4 + + and x3, x3, 0x3 + fadd v30.4s, v30.4s, v28.4s + fadd v31.4s, v31.4s, v29.4s + cbz x3, save_result + +loop1: + ldr s0, [x1], 0x4 + ldp q16, q17, [x2], 0x20 + subs x3, x3, 0x1 + + fmla v30.4s, v16.4s, v0.s[0] + fmla v31.4s, v17.4s, v0.s[0] + + b.ne loop1 + +save_result: + stp q30, q31, [x4] + + ret + + .space 256 + .end + diff --git a/executor/operator/arm64/fc/sgemv_1x8_a72.S b/executor/operator/arm64/fc/sgemv_1x8_a72.S index aa5665cdc..6129d735e 100644 --- a/executor/operator/arm64/fc/sgemv_1x8_a72.S +++ b/executor/operator/arm64/fc/sgemv_1x8_a72.S @@ -1,132 +1,132 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: xiaowei@openailab.com - */ - -// -// 1*8 single precise floating point matric multiplication -// -// -- -- -// | k0 k1 .. k7 | -// | . . . . | -// -- -- | . . . . 
| -- -- -- -- -// | i0 - - - - - - | x | . . . . | + | b0 b1 .. b7 | = | i0k0 i0k1 .. i0k7 | -// -- -- | . . . . | -- -- -- -- -// | . . . . | -// | . . . . | -// -- -- -// input 1 x p kernel p x 8 biases 1 x 8 output 1 x 8 p = kernel size -// -// -// optimised for Cortex-A72 pipeline 13 cycle per loop (1*8*4 dot product) -// the bottleneck is memory bandwidth -// -// input: -// x0 arg0 biases start address {b0, b1, b2, b3, b4, b5, b6, b7}} -// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} -// x2 arg2 kernel data start address {k00, k10, k20, k30, k40, k50, k60, k70, k80, k01, k11, k21, k31, ...} -// x3 arg3 kernel size -// x4 arg4 output data save address {ik0, ik1, ik2, ik3, ik4, ik5, ik6, ik7, ik8} -// -// output: no -// -// v0 4S data of input {i3 i2 i1 i0 } -// v1~v7 not used -// v16 4S kernal data0 {k30 | k20 | k10 | k00} -// v17 4S kernal data4 {k70 | k60 | k50 | k40} -// v18 4S kernal data1 {k31 | k21 | k11 | k01} -// v19 4S kernal data5 {k71 | k61 | k51 | k41} -// v20 4S kernal data2 {k32 | k22 | k12 | k02} -// v21 4S kernal data6 {k72 | k62 | k52 | k42} -// v22 4S kernal data3 {k33 | k23 | k13 | k03} -// v23 4S kernal data7 {k73 | k63 | k53 | k43} -// v24-v29 not used -// v30 dot product for {ik3, ik2, ik1, ik0} -// v31 dot product for {ik7, ik6, ik5, ik4} - - .section .text,"ax" - .align 5 - - .type sgemv_1x8_a72 STT_FUNC - .global sgemv_1x8_a72 -sgemv_1x8_a72: -// initial - cmp x3, 0x4 - prfm pldl1keep, [x1, 0x40] - prfm pldl1keep, [x2, 0x200] - prfm pldl1keep, [x2, 0x240] - movi d30, 0 - movi d31, 0 - cbz x0, start_convolution - ldp q30, q31, [x0] - -start_convolution: - and x5, x3, 0x3 - b.lt loop1 - lsr x6, x3, 0x2 - - -// main loop each loop generate dot prodcut for 1x8x4SFP -loop4: - ldr q0, [x1] // q0 = i[3-0] - ldp q16, q17, [x2] // q16 = k[3-0][0] q17 = k[7-4][0] - ldp q18, q19, [x2, 0x20] // q18 = k[3-0][1] q19 = k[7-4][1] - prfm pldl1keep, [x1, 0x80] - add x1, x1, 0x10 - subs x6, x6, 0x1 - - fmla v30.4s, 
v16.4s, v0.s[0] // ik[3-0][0] - fmla v31.4s, v17.4s, v0.s[0] // ik[7-4][0] - ldp q20, q21, [x2, 0x40] // q20 = k[3-0][2] q21 = k[7-4][2] - fmla v30.4s, v18.4s, v0.s[1] // ik[3-0][1] - prfm pldl1keep, [x2, 0x400] - fmla v31.4s, v19.4s, v0.s[1] // ik[7-4][1] - ldp q22, q23, [x2, 0x60] // q22 = k[3-0][3] q23 = k[7-4][3] - fmla v30.4s, v20.4s, v0.s[2] // ik[3-0][2] - prfm pldl1keep, [x2, 0x440] - add x2, x2, 0x80 - fmla v31.4s, v21.4s, v0.s[2] // ik[7-4][2] - fmla v30.4s, v22.4s, v0.s[3] // ik[3-0][3] - fmla v31.4s, v23.4s, v0.s[3] // ik[7-4][3] - - b.ne loop4 - - cbz x5, save_result - -loop1: - ldr s0, [x1], 0x4 - ldp q16, q17, [x2], 0x20 - subs x5, x5, 0x1 - - fmla v30.4s, v16.4s, v0.s[0] - fmla v31.4s, v17.4s, v0.s[0] - - b.ne loop1 - -save_result: - stp q30, q31, [x4] - - ret - - .space 256 - .end - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: xiaowei@openailab.com + */ + +// +// 1*8 single precise floating point matric multiplication +// +// -- -- +// | k0 k1 .. k7 | +// | . . . . | +// -- -- | . . . . | -- -- -- -- +// | i0 - - - - - - | x | . . . . | + | b0 b1 .. b7 | = | i0k0 i0k1 .. i0k7 | +// -- -- | . . . . | -- -- -- -- +// | . . . . | +// | . . . 
. | +// -- -- +// input 1 x p kernel p x 8 biases 1 x 8 output 1 x 8 p = kernel size +// +// +// optimised for Cortex-A72 pipeline 13 cycle per loop (1*8*4 dot product) +// the bottleneck is memory bandwidth +// +// input: +// x0 arg0 biases start address {b0, b1, b2, b3, b4, b5, b6, b7}} +// x1 arg1 input data start address {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, ...} +// x2 arg2 kernel data start address {k00, k10, k20, k30, k40, k50, k60, k70, k80, k01, k11, k21, k31, ...} +// x3 arg3 kernel size +// x4 arg4 output data save address {ik0, ik1, ik2, ik3, ik4, ik5, ik6, ik7, ik8} +// +// output: no +// +// v0 4S data of input {i3 i2 i1 i0 } +// v1~v7 not used +// v16 4S kernal data0 {k30 | k20 | k10 | k00} +// v17 4S kernal data4 {k70 | k60 | k50 | k40} +// v18 4S kernal data1 {k31 | k21 | k11 | k01} +// v19 4S kernal data5 {k71 | k61 | k51 | k41} +// v20 4S kernal data2 {k32 | k22 | k12 | k02} +// v21 4S kernal data6 {k72 | k62 | k52 | k42} +// v22 4S kernal data3 {k33 | k23 | k13 | k03} +// v23 4S kernal data7 {k73 | k63 | k53 | k43} +// v24-v29 not used +// v30 dot product for {ik3, ik2, ik1, ik0} +// v31 dot product for {ik7, ik6, ik5, ik4} + + .section .text,"ax" + .align 5 + + .type sgemv_1x8_a72 STT_FUNC + .global sgemv_1x8_a72 +sgemv_1x8_a72: +// initial + cmp x3, 0x4 + prfm pldl1keep, [x1, 0x40] + prfm pldl1keep, [x2, 0x200] + prfm pldl1keep, [x2, 0x240] + movi d30, 0 + movi d31, 0 + cbz x0, start_convolution + ldp q30, q31, [x0] + +start_convolution: + and x5, x3, 0x3 + b.lt loop1 + lsr x6, x3, 0x2 + + +// main loop each loop generate dot prodcut for 1x8x4SFP +loop4: + ldr q0, [x1] // q0 = i[3-0] + ldp q16, q17, [x2] // q16 = k[3-0][0] q17 = k[7-4][0] + ldp q18, q19, [x2, 0x20] // q18 = k[3-0][1] q19 = k[7-4][1] + prfm pldl1keep, [x1, 0x80] + add x1, x1, 0x10 + subs x6, x6, 0x1 + + fmla v30.4s, v16.4s, v0.s[0] // ik[3-0][0] + fmla v31.4s, v17.4s, v0.s[0] // ik[7-4][0] + ldp q20, q21, [x2, 0x40] // q20 = k[3-0][2] q21 = k[7-4][2] + fmla v30.4s, 
v18.4s, v0.s[1] // ik[3-0][1] + prfm pldl1keep, [x2, 0x400] + fmla v31.4s, v19.4s, v0.s[1] // ik[7-4][1] + ldp q22, q23, [x2, 0x60] // q22 = k[3-0][3] q23 = k[7-4][3] + fmla v30.4s, v20.4s, v0.s[2] // ik[3-0][2] + prfm pldl1keep, [x2, 0x440] + add x2, x2, 0x80 + fmla v31.4s, v21.4s, v0.s[2] // ik[7-4][2] + fmla v30.4s, v22.4s, v0.s[3] // ik[3-0][3] + fmla v31.4s, v23.4s, v0.s[3] // ik[7-4][3] + + b.ne loop4 + + cbz x5, save_result + +loop1: + ldr s0, [x1], 0x4 + ldp q16, q17, [x2], 0x20 + subs x5, x5, 0x1 + + fmla v30.4s, v16.4s, v0.s[0] + fmla v31.4s, v17.4s, v0.s[0] + + b.ne loop1 + +save_result: + stp q30, q31, [x4] + + ret + + .space 256 + .end + diff --git a/executor/operator/arm64/fused/Makefile b/executor/operator/arm64/fused/Makefile index d250d1466..f9a189f58 100644 --- a/executor/operator/arm64/fused/Makefile +++ b/executor/operator/arm64/fused/Makefile @@ -1,3 +1,3 @@ -obj-y+=fused_bn_scale_relu.o -obj-y+=bn_scale_relu_neon.o - +obj-y+=fused_bn_scale_relu.o +obj-y+=bn_scale_relu_neon.o + diff --git a/executor/operator/arm64/fused/bn_scale_relu_neon.S b/executor/operator/arm64/fused/bn_scale_relu_neon.S index 095c0dc5f..316b00351 100644 --- a/executor/operator/arm64/fused/bn_scale_relu_neon.S +++ b/executor/operator/arm64/fused/bn_scale_relu_neon.S @@ -1,332 +1,332 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haitao@openailab.com - */ -/* relu implementation using neon vector */ - - -.text -.align 5 -.global bn_scale_relu_neon -.type bn_scale_relu_neon, %function - -bn_scale_relu_neon: - //x0 input - //x1 gamma - //x2 beta - //x3 mean - //x4 var - //x5 channel_number - //x6 channel_size - //x7 output - - //s28 -- gamma - //s29 -- beta - //s30 -- mean - //s31 -- var - //v27 --- zero - - /* - data=data*s_var+s_mean; - data=data*s_gamma+s_beta; - */ - - fmov s27,wzr - dup v27.4s,v27.s[0] - -channel_start: - - ldr s28,[x1],#4 - ld1r {v29.4s},[x2],#4 - ld1r {v30.4s},[x3],#4 - ldr s31,[x4],#4 - - lsr x9,x6,6 - lsl x10,x9,6 - sub x10,x6,x10 - cbz x9, less_64 - - ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 - ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 - - sub x9,x9,1 - cbz x9, last_block_64 - -block_64_start: - - - mov v8.16b,v30.16b - mov v9.16b,v30.16b - mov v10.16b,v30.16b - mov v11.16b,v30.16b - - ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 - - fmla v8.4s,v0.4s,v31.s[0] - fmla v9.4s,v1.4s,v31.s[0] - fmla v10.4s,v2.4s,v31.s[0] - fmla v11.4s,v3.4s,v31.s[0] - - - fmul v8.4s,v8.4s,v28.s[0] - fmul v9.4s,v9.4s,v28.s[0] - fmul v10.4s,v10.4s,v28.s[0] - fmul v11.4s,v11.4s,v28.s[0] - - fadd v8.4s,v8.4s,v29.4s - fadd v9.4s,v9.4s,v29.4s - fadd v10.4s,v10.4s,v29.4s - fadd v11.4s,v11.4s,v29.4s - - fmax v8.4s,v8.4s,v27.4s - fmax v9.4s,v9.4s,v27.4s - fmax v10.4s,v10.4s,v27.4s - fmax v11.4s,v11.4s,v27.4s - - mov v12.16b,v30.16b - mov v13.16b,v30.16b - - st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 - - mov v14.16b,v30.16b - mov 
v15.16b,v30.16b - - ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0],#64 - - fmla v12.4s,v4.4s,v31.s[0] - fmla v13.4s,v5.4s,v31.s[0] - fmla v14.4s,v6.4s,v31.s[0] - fmla v15.4s,v7.4s,v31.s[0] - - fmul v12.4s,v12.4s,v28.s[0] - fmul v13.4s,v13.4s,v28.s[0] - fmul v14.4s,v14.4s,v28.s[0] - fmul v15.4s,v15.4s,v28.s[0] - - fadd v12.4s,v12.4s,v29.4s - fadd v13.4s,v13.4s,v29.4s - fadd v14.4s,v14.4s,v29.4s - fadd v15.4s,v15.4s,v29.4s - - fmax v12.4s,v12.4s,v27.4s - fmax v13.4s,v13.4s,v27.4s - fmax v14.4s,v14.4s,v27.4s - fmax v15.4s,v15.4s,v27.4s - - mov v8.16b,v30.16b - mov v9.16b,v30.16b - - st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 - - mov v10.16b,v30.16b - mov v11.16b,v30.16b - subs x9,x9,1 - - ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 - - fmla v8.4s,v16.4s,v31.s[0] - fmla v9.4s,v17.4s,v31.s[0] - fmla v10.4s,v18.4s,v31.s[0] - fmla v11.4s,v19.4s,v31.s[0] - - fmul v8.4s,v8.4s,v28.s[0] - fmul v9.4s,v9.4s,v28.s[0] - fmul v10.4s,v10.4s,v28.s[0] - fmul v11.4s,v11.4s,v28.s[0] - - fadd v8.4s,v8.4s,v29.4s - fadd v9.4s,v9.4s,v29.4s - fadd v10.4s,v10.4s,v29.4s - fadd v11.4s,v11.4s,v29.4s - - fmax v8.4s,v8.4s,v27.4s - fmax v9.4s,v9.4s,v27.4s - fmax v10.4s,v10.4s,v27.4s - fmax v11.4s,v11.4s,v27.4s - - - mov v12.16b,v30.16b - mov v13.16b,v30.16b - - st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 - - mov v14.16b,v30.16b - mov v15.16b,v30.16b - - ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 - - fmla v12.4s,v20.4s,v31.s[0] - fmla v13.4s,v21.4s,v31.s[0] - fmla v14.4s,v22.4s,v31.s[0] - fmla v15.4s,v23.4s,v31.s[0] - - fmul v12.4s,v12.4s,v28.s[0] - fmul v13.4s,v13.4s,v28.s[0] - fmul v14.4s,v14.4s,v28.s[0] - fmul v15.4s,v15.4s,v28.s[0] - - fadd v12.4s,v12.4s,v29.4s - fadd v13.4s,v13.4s,v29.4s - fadd v14.4s,v14.4s,v29.4s - fadd v15.4s,v15.4s,v29.4s - - fmax v12.4s,v12.4s,v27.4s - fmax v13.4s,v13.4s,v27.4s - fmax v14.4s,v14.4s,v27.4s - fmax v15.4s,v15.4s,v27.4s - - st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 - - b.ne block_64_start - -last_block_64: - - ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 - - mov v8.16b,v30.16b - 
mov v9.16b,v30.16b - mov v10.16b,v30.16b - mov v11.16b,v30.16b - - fmla v8.4s,v0.4s,v31.s[0] - fmla v9.4s,v1.4s,v31.s[0] - fmla v10.4s,v2.4s,v31.s[0] - fmla v11.4s,v3.4s,v31.s[0] - - fmul v8.4s,v8.4s,v28.s[0] - fmul v9.4s,v9.4s,v28.s[0] - fmul v10.4s,v10.4s,v28.s[0] - fmul v11.4s,v11.4s,v28.s[0] - - fadd v8.4s,v8.4s,v29.4s - fadd v9.4s,v9.4s,v29.4s - fadd v10.4s,v10.4s,v29.4s - fadd v11.4s,v11.4s,v29.4s - - fmax v8.4s,v8.4s,v27.4s - fmax v9.4s,v9.4s,v27.4s - fmax v10.4s,v10.4s,v27.4s - fmax v11.4s,v11.4s,v27.4s - - st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 - - ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0],#64 - - mov v12.16b,v30.16b - mov v13.16b,v30.16b - mov v14.16b,v30.16b - mov v15.16b,v30.16b - - fmla v12.4s,v4.4s,v31.s[0] - fmla v13.4s,v5.4s,v31.s[0] - fmla v14.4s,v6.4s,v31.s[0] - fmla v15.4s,v7.4s,v31.s[0] - - fmul v12.4s,v12.4s,v28.s[0] - fmul v13.4s,v13.4s,v28.s[0] - fmul v14.4s,v14.4s,v28.s[0] - fmul v15.4s,v15.4s,v28.s[0] - - fadd v12.4s,v12.4s,v29.4s - fadd v13.4s,v13.4s,v29.4s - fadd v14.4s,v14.4s,v29.4s - fadd v15.4s,v15.4s,v29.4s - - fmax v12.4s,v12.4s,v27.4s - fmax v13.4s,v13.4s,v27.4s - fmax v14.4s,v14.4s,v27.4s - fmax v15.4s,v15.4s,v27.4s - - - st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 - - mov v8.16b,v30.16b - mov v9.16b,v30.16b - mov v10.16b,v30.16b - mov v11.16b,v30.16b - - fmla v8.4s,v16.4s,v31.s[0] - fmla v9.4s,v17.4s,v31.s[0] - fmla v10.4s,v18.4s,v31.s[0] - fmla v11.4s,v19.4s,v31.s[0] - - fmul v8.4s,v8.4s,v28.s[0] - fmul v9.4s,v9.4s,v28.s[0] - fmul v10.4s,v10.4s,v28.s[0] - fmul v11.4s,v11.4s,v28.s[0] - - fadd v8.4s,v8.4s,v29.4s - fadd v9.4s,v9.4s,v29.4s - fadd v10.4s,v10.4s,v29.4s - fadd v11.4s,v11.4s,v29.4s - - fmax v8.4s,v8.4s,v27.4s - fmax v9.4s,v9.4s,v27.4s - fmax v10.4s,v10.4s,v27.4s - fmax v11.4s,v11.4s,v27.4s - - mov v12.16b,v30.16b - mov v13.16b,v30.16b - mov v14.16b,v30.16b - mov v15.16b,v30.16b - - st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 - - fmla v12.4s,v20.4s,v31.s[0] - fmla v13.4s,v21.4s,v31.s[0] - fmla v14.4s,v22.4s,v31.s[0] - fmla 
v15.4s,v23.4s,v31.s[0] - - fmul v12.4s,v12.4s,v28.s[0] - fmul v13.4s,v13.4s,v28.s[0] - fmul v14.4s,v14.4s,v28.s[0] - fmul v15.4s,v15.4s,v28.s[0] - - fadd v12.4s,v12.4s,v29.4s - fadd v13.4s,v13.4s,v29.4s - fadd v14.4s,v14.4s,v29.4s - fadd v15.4s,v15.4s,v29.4s - - fmax v12.4s,v12.4s,v27.4s - fmax v13.4s,v13.4s,v27.4s - fmax v14.4s,v14.4s,v27.4s - fmax v15.4s,v15.4s,v27.4s - - st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 - - cbz x10, channel_done - -less_64: - subs x10,x10,1 - ldr s0,[x0],#4 - fmadd s1,s0,s31,s30 - fmadd s1,s1,s28,s29 - fmax s1,s1,s27 - str s1,[x7],#4 - b.ne less_64 - -channel_done: - - subs x5,x5,1 //channel_counter - b.ne channel_start - - ret +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ +/* relu implementation using neon vector */ + + +.text +.align 5 +.global bn_scale_relu_neon +.type bn_scale_relu_neon, %function + +bn_scale_relu_neon: + //x0 input + //x1 gamma + //x2 beta + //x3 mean + //x4 var + //x5 channel_number + //x6 channel_size + //x7 output + + //s28 -- gamma + //s29 -- beta + //s30 -- mean + //s31 -- var + //v27 --- zero + + /* + data=data*s_var+s_mean; + data=data*s_gamma+s_beta; + */ + + fmov s27,wzr + dup v27.4s,v27.s[0] + +channel_start: + + ldr s28,[x1],#4 + ld1r {v29.4s},[x2],#4 + ld1r {v30.4s},[x3],#4 + ldr s31,[x4],#4 + + lsr x9,x6,6 + lsl x10,x9,6 + sub x10,x6,x10 + cbz x9, less_64 + + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + + sub x9,x9,1 + cbz x9, last_block_64 + +block_64_start: + + + mov v8.16b,v30.16b + mov v9.16b,v30.16b + mov v10.16b,v30.16b + mov v11.16b,v30.16b + + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 + + fmla v8.4s,v0.4s,v31.s[0] + fmla v9.4s,v1.4s,v31.s[0] + fmla v10.4s,v2.4s,v31.s[0] + fmla v11.4s,v3.4s,v31.s[0] + + + fmul v8.4s,v8.4s,v28.s[0] + fmul v9.4s,v9.4s,v28.s[0] + fmul v10.4s,v10.4s,v28.s[0] + fmul v11.4s,v11.4s,v28.s[0] + + fadd v8.4s,v8.4s,v29.4s + fadd v9.4s,v9.4s,v29.4s + fadd v10.4s,v10.4s,v29.4s + fadd v11.4s,v11.4s,v29.4s + + fmax v8.4s,v8.4s,v27.4s + fmax v9.4s,v9.4s,v27.4s + fmax v10.4s,v10.4s,v27.4s + fmax v11.4s,v11.4s,v27.4s + + mov v12.16b,v30.16b + mov v13.16b,v30.16b + + st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 + + mov v14.16b,v30.16b + mov v15.16b,v30.16b + + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0],#64 + + fmla v12.4s,v4.4s,v31.s[0] + fmla v13.4s,v5.4s,v31.s[0] + fmla v14.4s,v6.4s,v31.s[0] + fmla v15.4s,v7.4s,v31.s[0] + + fmul v12.4s,v12.4s,v28.s[0] + fmul v13.4s,v13.4s,v28.s[0] + fmul v14.4s,v14.4s,v28.s[0] + fmul v15.4s,v15.4s,v28.s[0] + + fadd v12.4s,v12.4s,v29.4s + fadd v13.4s,v13.4s,v29.4s + fadd v14.4s,v14.4s,v29.4s + fadd v15.4s,v15.4s,v29.4s + + fmax 
v12.4s,v12.4s,v27.4s + fmax v13.4s,v13.4s,v27.4s + fmax v14.4s,v14.4s,v27.4s + fmax v15.4s,v15.4s,v27.4s + + mov v8.16b,v30.16b + mov v9.16b,v30.16b + + st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 + + mov v10.16b,v30.16b + mov v11.16b,v30.16b + subs x9,x9,1 + + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 + + fmla v8.4s,v16.4s,v31.s[0] + fmla v9.4s,v17.4s,v31.s[0] + fmla v10.4s,v18.4s,v31.s[0] + fmla v11.4s,v19.4s,v31.s[0] + + fmul v8.4s,v8.4s,v28.s[0] + fmul v9.4s,v9.4s,v28.s[0] + fmul v10.4s,v10.4s,v28.s[0] + fmul v11.4s,v11.4s,v28.s[0] + + fadd v8.4s,v8.4s,v29.4s + fadd v9.4s,v9.4s,v29.4s + fadd v10.4s,v10.4s,v29.4s + fadd v11.4s,v11.4s,v29.4s + + fmax v8.4s,v8.4s,v27.4s + fmax v9.4s,v9.4s,v27.4s + fmax v10.4s,v10.4s,v27.4s + fmax v11.4s,v11.4s,v27.4s + + + mov v12.16b,v30.16b + mov v13.16b,v30.16b + + st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 + + mov v14.16b,v30.16b + mov v15.16b,v30.16b + + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + + fmla v12.4s,v20.4s,v31.s[0] + fmla v13.4s,v21.4s,v31.s[0] + fmla v14.4s,v22.4s,v31.s[0] + fmla v15.4s,v23.4s,v31.s[0] + + fmul v12.4s,v12.4s,v28.s[0] + fmul v13.4s,v13.4s,v28.s[0] + fmul v14.4s,v14.4s,v28.s[0] + fmul v15.4s,v15.4s,v28.s[0] + + fadd v12.4s,v12.4s,v29.4s + fadd v13.4s,v13.4s,v29.4s + fadd v14.4s,v14.4s,v29.4s + fadd v15.4s,v15.4s,v29.4s + + fmax v12.4s,v12.4s,v27.4s + fmax v13.4s,v13.4s,v27.4s + fmax v14.4s,v14.4s,v27.4s + fmax v15.4s,v15.4s,v27.4s + + st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 + + b.ne block_64_start + +last_block_64: + + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 + + mov v8.16b,v30.16b + mov v9.16b,v30.16b + mov v10.16b,v30.16b + mov v11.16b,v30.16b + + fmla v8.4s,v0.4s,v31.s[0] + fmla v9.4s,v1.4s,v31.s[0] + fmla v10.4s,v2.4s,v31.s[0] + fmla v11.4s,v3.4s,v31.s[0] + + fmul v8.4s,v8.4s,v28.s[0] + fmul v9.4s,v9.4s,v28.s[0] + fmul v10.4s,v10.4s,v28.s[0] + fmul v11.4s,v11.4s,v28.s[0] + + fadd v8.4s,v8.4s,v29.4s + fadd v9.4s,v9.4s,v29.4s + fadd v10.4s,v10.4s,v29.4s + fadd v11.4s,v11.4s,v29.4s + + fmax 
v8.4s,v8.4s,v27.4s + fmax v9.4s,v9.4s,v27.4s + fmax v10.4s,v10.4s,v27.4s + fmax v11.4s,v11.4s,v27.4s + + st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 + + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0],#64 + + mov v12.16b,v30.16b + mov v13.16b,v30.16b + mov v14.16b,v30.16b + mov v15.16b,v30.16b + + fmla v12.4s,v4.4s,v31.s[0] + fmla v13.4s,v5.4s,v31.s[0] + fmla v14.4s,v6.4s,v31.s[0] + fmla v15.4s,v7.4s,v31.s[0] + + fmul v12.4s,v12.4s,v28.s[0] + fmul v13.4s,v13.4s,v28.s[0] + fmul v14.4s,v14.4s,v28.s[0] + fmul v15.4s,v15.4s,v28.s[0] + + fadd v12.4s,v12.4s,v29.4s + fadd v13.4s,v13.4s,v29.4s + fadd v14.4s,v14.4s,v29.4s + fadd v15.4s,v15.4s,v29.4s + + fmax v12.4s,v12.4s,v27.4s + fmax v13.4s,v13.4s,v27.4s + fmax v14.4s,v14.4s,v27.4s + fmax v15.4s,v15.4s,v27.4s + + + st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 + + mov v8.16b,v30.16b + mov v9.16b,v30.16b + mov v10.16b,v30.16b + mov v11.16b,v30.16b + + fmla v8.4s,v16.4s,v31.s[0] + fmla v9.4s,v17.4s,v31.s[0] + fmla v10.4s,v18.4s,v31.s[0] + fmla v11.4s,v19.4s,v31.s[0] + + fmul v8.4s,v8.4s,v28.s[0] + fmul v9.4s,v9.4s,v28.s[0] + fmul v10.4s,v10.4s,v28.s[0] + fmul v11.4s,v11.4s,v28.s[0] + + fadd v8.4s,v8.4s,v29.4s + fadd v9.4s,v9.4s,v29.4s + fadd v10.4s,v10.4s,v29.4s + fadd v11.4s,v11.4s,v29.4s + + fmax v8.4s,v8.4s,v27.4s + fmax v9.4s,v9.4s,v27.4s + fmax v10.4s,v10.4s,v27.4s + fmax v11.4s,v11.4s,v27.4s + + mov v12.16b,v30.16b + mov v13.16b,v30.16b + mov v14.16b,v30.16b + mov v15.16b,v30.16b + + st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x7],#64 + + fmla v12.4s,v20.4s,v31.s[0] + fmla v13.4s,v21.4s,v31.s[0] + fmla v14.4s,v22.4s,v31.s[0] + fmla v15.4s,v23.4s,v31.s[0] + + fmul v12.4s,v12.4s,v28.s[0] + fmul v13.4s,v13.4s,v28.s[0] + fmul v14.4s,v14.4s,v28.s[0] + fmul v15.4s,v15.4s,v28.s[0] + + fadd v12.4s,v12.4s,v29.4s + fadd v13.4s,v13.4s,v29.4s + fadd v14.4s,v14.4s,v29.4s + fadd v15.4s,v15.4s,v29.4s + + fmax v12.4s,v12.4s,v27.4s + fmax v13.4s,v13.4s,v27.4s + fmax v14.4s,v14.4s,v27.4s + fmax v15.4s,v15.4s,v27.4s + + st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x7],#64 
+ + cbz x10, channel_done + +less_64: + subs x10,x10,1 + ldr s0,[x0],#4 + fmadd s1,s0,s31,s30 + fmadd s1,s1,s28,s29 + fmax s1,s1,s27 + str s1,[x7],#4 + b.ne less_64 + +channel_done: + + subs x5,x5,1 //channel_counter + b.ne channel_start + + ret diff --git a/executor/operator/arm64/fused/fused_bn_scale_relu.cpp b/executor/operator/arm64/fused/fused_bn_scale_relu.cpp index 4dfb8753a..0caab72ff 100644 --- a/executor/operator/arm64/fused/fused_bn_scale_relu.cpp +++ b/executor/operator/arm64/fused/fused_bn_scale_relu.cpp @@ -1,207 +1,207 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haitao@openailab.com - */ -#include -#include -#include - -#include "logger.hpp" -#include "operator/fused_operator.hpp" -#include "node_ops.hpp" -#include "tensor_mem.hpp" -#include "graph.hpp" - -extern "C" void bn_scale_relu_neon(const float* input, float* gamma, float* beta, float* mean, float* var, - int channel_number, int channel_size, float* output); - -namespace TEngine { - -namespace FusedBNScaleReluArm64 { - -struct FusedOps : public MTNodeOps -{ - struct BNParam - { - const float* input; - float* gamma; - float* beta; - float* mean; - float* var; - int channel_num; - int channel_size; - float* output; - }; - - bool Aider(int cpu, int seq, void* data) - { - BNParam* param = ( BNParam* )(data); - - bn_scale_relu_neon(param->input, param->gamma, param->beta, param->mean, param->var, param->channel_num, - param->channel_size, param->output); - - return true; - } - - bool OnBind(Node* node) - { - inplace_t io_map; - - io_map[0] = 0; - - node->SetAttr(ATTR_INPLACE, io_map); - - return true; - } - - bool Run(Node* node) - { - const Tensor* input_tensor = node->GetInputTensor(0); - Tensor* output_tensor = node->GetOutputTensor(0); - - const TShape& shape = input_tensor->GetShape(); - - const std::vector dims = shape.GetDim(); - - int batch_number = dims[0]; - int channel_num = dims[1]; - int channel_size = dims[2] * dims[3]; - - Tensor* gamma_tensor = node->GetInputTensor(1); - Tensor* beta_tensor = node->GetInputTensor(2); - Tensor* mean_tensor = node->GetInputTensor(3); - Tensor* var_tensor = node->GetInputTensor(4); - - float* gamma = ( float* )get_tensor_mem(gamma_tensor); - float* beta = ( float* )get_tensor_mem(beta_tensor); - float* mean = ( float* )get_tensor_mem(mean_tensor); - float* var = ( float* )get_tensor_mem(var_tensor); - - const float* input = ( const float* )get_tensor_mem(input_tensor); - float* output = ( float* )get_tensor_mem(output_tensor); - - int cpu_number = 
cpu_info->GetCPUNumber(); - - for(int i = 0; i < batch_number; i++) - { - if(cpu_number == 1) - { - bn_scale_relu_neon(input, gamma, beta, mean, var, channel_num, channel_size, output); - input += channel_size * channel_num; - output += channel_size * channel_num; - } - else - { - std::vector task_list; - std::vector param_list; - - auto f = std::bind(&FusedOps::Aider, this, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - - int step = (channel_num + (cpu_number - 1)) / cpu_number; - - if(channel_num - (cpu_number - 1) * step <= 0) - step = channel_num / cpu_number; - - task_list.resize(cpu_number); - param_list.resize(cpu_number); - - for(int i = 0; i < cpu_number; i++) - { - BNParam* param = ¶m_list[i]; - sub_op_task* task = &task_list[i]; - - task->exec_func = f; - task->seq = i; - task->data = param; - - param->input = input; - param->gamma = gamma; - param->beta = beta; - param->mean = mean; - param->var = var; - - param->channel_num = step; - param->channel_size = channel_size; - param->output = output; - - input += channel_size * step; - output += channel_size * step; - - gamma += step; - beta += step; - mean += step; - var += step; - } - - param_list[cpu_number - 1].channel_num = channel_num - (cpu_number - 1) * step; - - task_dispatch(task_list, -1); - wait_done(); - } - - /* - the c code of assembly code - - for(int c=0;c +#include +#include + +#include "logger.hpp" +#include "operator/fused_operator.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "graph.hpp" + +extern "C" void bn_scale_relu_neon(const float* input, float* gamma, float* beta, float* mean, float* var, + int channel_number, int channel_size, float* output); + +namespace TEngine { + +namespace FusedBNScaleReluArm64 { + +struct FusedOps : public MTNodeOps +{ + struct BNParam + { + const float* input; + float* gamma; + float* beta; + float* mean; + float* var; + int channel_num; + int channel_size; + float* output; + }; + + bool Aider(int cpu, int 
seq, void* data) + { + BNParam* param = ( BNParam* )(data); + + bn_scale_relu_neon(param->input, param->gamma, param->beta, param->mean, param->var, param->channel_num, + param->channel_size, param->output); + + return true; + } + + bool OnBind(Node* node) + { + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + + return true; + } + + bool Run(Node* node) + { + const Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + + const TShape& shape = input_tensor->GetShape(); + + const std::vector dims = shape.GetDim(); + + int batch_number = dims[0]; + int channel_num = dims[1]; + int channel_size = dims[2] * dims[3]; + + Tensor* gamma_tensor = node->GetInputTensor(1); + Tensor* beta_tensor = node->GetInputTensor(2); + Tensor* mean_tensor = node->GetInputTensor(3); + Tensor* var_tensor = node->GetInputTensor(4); + + float* gamma = ( float* )get_tensor_mem(gamma_tensor); + float* beta = ( float* )get_tensor_mem(beta_tensor); + float* mean = ( float* )get_tensor_mem(mean_tensor); + float* var = ( float* )get_tensor_mem(var_tensor); + + const float* input = ( const float* )get_tensor_mem(input_tensor); + float* output = ( float* )get_tensor_mem(output_tensor); + + int cpu_number = cpu_info->GetCPUNumber(); + + for(int i = 0; i < batch_number; i++) + { + if(cpu_number == 1) + { + bn_scale_relu_neon(input, gamma, beta, mean, var, channel_num, channel_size, output); + input += channel_size * channel_num; + output += channel_size * channel_num; + } + else + { + std::vector task_list; + std::vector param_list; + + auto f = std::bind(&FusedOps::Aider, this, std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + + int step = (channel_num + (cpu_number - 1)) / cpu_number; + + if(channel_num - (cpu_number - 1) * step <= 0) + step = channel_num / cpu_number; + + task_list.resize(cpu_number); + param_list.resize(cpu_number); + + for(int i = 0; i < cpu_number; i++) + { + BNParam* param = 
¶m_list[i]; + sub_op_task* task = &task_list[i]; + + task->exec_func = f; + task->seq = i; + task->data = param; + + param->input = input; + param->gamma = gamma; + param->beta = beta; + param->mean = mean; + param->var = var; + + param->channel_num = step; + param->channel_size = channel_size; + param->output = output; + + input += channel_size * step; + output += channel_size * step; + + gamma += step; + beta += step; + mean += step; + var += step; + } + + param_list[cpu_number - 1].channel_num = channel_num - (cpu_number - 1) * step; + + task_dispatch(task_list, -1); + wait_done(); + } + + /* + the c code of assembly code + + for(int c=0;c /** -* MaxPool_2x2: pooling for ksize=2x2,stride=2, pad=0(default pad=0) -* @param[in] input input data (const float pointer) -* @param[in] output output data (float pointer) -* @param[in] inc input channel (int) -* @param[in] inh input height (int) -* @param[in] inw input width (int) -* @param[in] outh output height (int) -* @param[in] outw output width (int) -* @return None -*/ + * MaxPool_2x2: pooling for ksize=2x2,stride=2, pad=0(default pad=0) + * @param[in] input input data (const float pointer) + * @param[in] output output data (float pointer) + * @param[in] inc input channel (int) + * @param[in] inh input height (int) + * @param[in] inw input width (int) + * @param[in] outh output height (int) + * @param[in] outw output width (int) + * @return None + */ static void MaxPool_2x2s2(const float* input, float* output, int inc, int inh, int inw, int outh, int outw, int, int, int, int, int, int, int pad_h1, int pad_w1, int) diff --git a/executor/operator/arm64/init.cpp b/executor/operator/arm64/init.cpp index b8f52d7ad..5a2a51771 100644 --- a/executor/operator/arm64/init.cpp +++ b/executor/operator/arm64/init.cpp @@ -25,16 +25,14 @@ namespace TEngine { extern void RegisterConv2dFast(void); -extern void RegisterConv2dINT8(void); extern void RegisterConv2dDepth(void); extern void RegisterFullyConnectedFast(void); -extern void 
RegisterFullyConnectedINT8(void); extern void RegisterPoolingNodeExec(void); extern void RegisterBatchNormNodeExec(void); extern void RegisterScaleNodeExec(void); -extern void RegisterDeconvNodeExec(void); extern void RegisterLRNNodeExec(void); + void __attribute__((visibility("default"))) RegisterArmOps(void) { RegisterConv2dFast(); diff --git a/executor/operator/arm64/pooling.cpp b/executor/operator/arm64/pooling.cpp index 221b7d602..0cd1473c2 100644 --- a/executor/operator/arm64/pooling.cpp +++ b/executor/operator/arm64/pooling.cpp @@ -31,15 +31,21 @@ #include "tensor_mem.hpp" #include "pooling_kernel.h" +#ifdef CONFIG_AUTH_DEVICE +#include "auth_nodeops.hpp" +#endif + namespace TEngine { namespace PoolingImpl { +const int default_prio = 100; + typedef void (*pool_kernel_t)(const float* input, float* output, int inc, int in_h, int inw, int out_h, int out_w, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe); -struct PoolingOps : public NodeOps +struct PoolingOps : public MTNodeOps { PoolingSize pooling_size = POOL_GENERIC; pool_kernel_t kernel_run = nullptr; @@ -80,15 +86,15 @@ struct PoolingOps : public NodeOps Pooling* pooling_op = dynamic_cast(node->GetOp()); PoolParam* param_ = pooling_op->GetParam(); - if(param_->strides[0] == 2 && param_->strides[1] == 2) + if(param_->stride_h == 2 && param_->stride_w == 2) { - if(param_->kernel_shape[0] == 2 && param_->kernel_shape[1] == 2) + if(param_->kernel_h == 2 && param_->kernel_w == 2) pooling_size = POOL_K2S2; - else if(param_->kernel_shape[0] == 3 && param_->kernel_shape[1] == 3) + else if(param_->kernel_h == 3 && param_->kernel_w == 3) pooling_size = POOL_K3S2; } - else if(param_->strides[0] == 1 && param_->strides[1] == 1 && param_->kernel_shape[0] == 3 && - param_->kernel_shape[1] == 3) + else if(param_->stride_h == 1 && param_->stride_w == 1 && param_->kernel_h == 3 && + param_->kernel_w == 3) { pooling_size = POOL_K3S1; } @@ -102,14 +108,14 
@@ struct PoolingOps : public NodeOps } kernel_run = Generic_MaxPool; - if(param_->pads[0] == 0 && param_->pads[1] == 0) + if(param_->pad_h0 == 0 && param_->pad_w0 == 0) { if(pooling_size == POOL_K2S2) kernel_run = MaxPool_2x2s2; else if(pooling_size == POOL_K3S2) kernel_run = MaxPool_3x3s2; } - else if(param_->pads[0] == 1 && param_->pads[1] == 1) + else if(param_->pad_h0 == 1 && param_->pad_w0 == 1) { if(pooling_size == POOL_K2S2) kernel_run = MaxPool_2x2s2_pad1; @@ -131,7 +137,7 @@ struct PoolingOps : public NodeOps } kernel_run = Generic_AvgPool; - if(param_->pads[0] == 0 && param_->pads[1] == 0) + if(param_->pad_h0 == 0 && param_->pad_w0 == 0) { if(pooling_size == POOL_K2S2) kernel_run = AvgPool_2x2s2; @@ -139,7 +145,7 @@ struct PoolingOps : public NodeOps kernel_run = AvgPool_3x3s2; } - if(param_->pads[0] == 1 && param_->pads[1] == 1) + if(param_->pad_h0 == 1 && param_->pad_w0 == 1) { if(pooling_size == POOL_K2S2) kernel_run = AvgPool_2x2s2_pad1; @@ -180,8 +186,8 @@ struct PoolingOps : public NodeOps printf("input: %d,%d,%d --> output: %d,%d \n", in_dim[1], in_dim[2], in_dim[3], out_dim[2], out_dim[3]); printf("kernel: %d, stride: %d, arg: %d, pad: %d,%d,%d,%d\n", - param_->kernel_shape[0], param_->strides[0], param_->alg, - param_->pads[0],param_->pads[1],param_->pads[2],param_->pads[3]); + param_->kernel_h, param_->stride_h, param_->alg, + param_->pad_h0,param_->pad_w0,param_->pad_h1,param_->pad_w1); #endif int is_caffe = param_->caffe_flavor; for(int n = 0; n < in_dim[0]; n++) @@ -191,8 +197,8 @@ struct PoolingOps : public NodeOps if(!exec_attr->pooling_mt) { kernel_run(in_ptr, out_ptr, in_dim[1], in_dim[2], in_dim[3], out_dim[2], out_dim[3], - param_->kernel_shape[0], param_->kernel_shape[1], param_->strides[0], param_->strides[1], - param_->pads[0], param_->pads[1], param_->pads[2], param_->pads[3], is_caffe); + param_->kernel_h, param_->kernel_w, param_->stride_h, param_->stride_w, + param_->pad_h0, param_->pad_w0, param_->pad_h1, param_->pad_w1, 
is_caffe); } else { @@ -221,14 +227,14 @@ struct PoolingOps : public NodeOps param->in_w = in_dim[3]; param->out_h = out_dim[2]; param->out_w = out_dim[3]; - param->kernel_h = param_->kernel_shape[0]; - param->kernel_w = param_->kernel_shape[1]; - param->stride_h = param_->strides[0]; - param->stride_w = param_->strides[1]; - param->pad_h0 = param_->pads[0]; - param->pad_w0 = param_->pads[1]; - param->pad_h1 = param_->pads[2]; - param->pad_w1 = param_->pads[3]; + param->kernel_h = param_->kernel_h; + param->kernel_w = param_->kernel_w; + param->stride_h = param_->stride_h; + param->stride_w = param_->stride_w; + param->pad_h0 = param_->pad_h0; + param->pad_w0 = param_->pad_w0; + param->pad_h1 = param_->pad_h1; + param->pad_w1 = param_->pad_w1; param->is_caffe = is_caffe; } @@ -241,17 +247,21 @@ struct PoolingOps : public NodeOps } }; -const int default_prio = 100; NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) { +#ifdef CONFIG_AUTH_DEVICE + if(!get_auth_float_enabled()) + return nullptr; +#endif + + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); - if(exec_attr->layout == TENGINE_LAYOUT_NHWC) + if( data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) return nullptr; PoolingOps* ops = new PoolingOps(); - ops->need_free = true; - return ops; } diff --git a/executor/operator/arm64/scale_neon.S b/executor/operator/arm64/scale_neon.S index f67da6c15..a66f39827 100644 --- a/executor/operator/arm64/scale_neon.S +++ b/executor/operator/arm64/scale_neon.S @@ -27,6 +27,7 @@ .text .align 5 .global scale_neon +.hidden scale_neon .type scale_neon, %function scale_neon: @@ -133,6 +134,7 @@ channel_done: //scale_neon_bias .global scale_neon_bias +.hidden scale_neon_bias .type scale_neon_bias, %function scale_neon_bias: diff --git a/executor/operator/common/Makefile b/executor/operator/common/Makefile index 9ff4bad94..cb0ad1b2e 100644 --- 
a/executor/operator/common/Makefile +++ b/executor/operator/common/Makefile @@ -1,4 +1,3 @@ -obj-y+=conv_ref.o obj-y+=concat.o obj-y+=dropout.o obj-y+=softmax.o @@ -22,7 +21,6 @@ obj-y+=resize.o obj-y+=pooling.o obj-y+=batchnorm.o obj-y+=scale.o -obj-y+=custom_kernel_ops.o obj-y+=logistic.o obj-y+=detection_postprocess.o obj-y+=fused/ diff --git a/executor/operator/common/batchnorm.cpp b/executor/operator/common/batchnorm.cpp index d251471f7..30aa6662d 100644 --- a/executor/operator/common/batchnorm.cpp +++ b/executor/operator/common/batchnorm.cpp @@ -165,15 +165,28 @@ struct BatchNormOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + if((input->GetShape()).GetDim().size() != 4) + return nullptr; + + BatchNormOps* ops = new BatchNormOps(); + + return ops; +} + } // namespace BatchNormImpl using namespace BatchNormImpl; void RegisterBatchNorm_NodeExec(void) { - BatchNormOps* ops = new BatchNormOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "BatchNormalization", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "BatchNormalization", BatchNormImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/blas/Makefile b/executor/operator/common/blas/Makefile index 29f55108a..849d3f253 100644 --- a/executor/operator/common/blas/Makefile +++ b/executor/operator/common/blas/Makefile @@ -1,4 +1,6 @@ obj-y+=conv_2d_blas.o obj-y+=deconv_2d_blas.o obj-y+=fc_blas.o -obj-y+=lstm_blas.o \ No newline at end of file +obj-y+=lstm_blas.o +obj-y+=rnn_blas.o +obj-y+=gru_blas.o \ No newline at end of file diff --git a/executor/operator/common/blas/conv_2d_blas.cpp b/executor/operator/common/blas/conv_2d_blas.cpp index 
d64c2838e..b05b87360 100644 --- a/executor/operator/common/blas/conv_2d_blas.cpp +++ b/executor/operator/common/blas/conv_2d_blas.cpp @@ -37,8 +37,8 @@ namespace TEngine { namespace ConvolutionImpl { -const char* conv_name = "CONV_IMPL"; -const int default_prio = 1200; +const char* conv_name = "CONV_BLAS"; +const int default_prio = 5000; struct ConvolutionOps : public NodeOps { @@ -177,8 +177,8 @@ struct ConvolutionOps : public NodeOps int ksize_h = param->kernel_h; int ksize_w = param->kernel_w; - int pad_w = param->pads[1]; - int pad_h = param->pads[0]; + int pad_w = param->pad_w0; + int pad_h = param->pad_h0; int stride_w = param->stride_w; int stride_h = param->stride_h; @@ -248,14 +248,14 @@ struct ConvolutionOps : public NodeOps NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) { + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); - if(exec_attr->layout == TENGINE_LAYOUT_NHWC) + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) return nullptr; ConvolutionOps* ops = new ConvolutionOps(); - ops->need_free = true; - return ops; } @@ -263,8 +263,9 @@ NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) void RegisterConvBlasNodeExec(void) { - NodeOpsRegistryManager::RegisterOPImplementor("common", "Convolution", ConvolutionImpl::SelectFunc, - ConvolutionImpl::default_prio); + if(!NodeOpsRegistryManager::RegisterOPImplementor("common", "Convolution", ConvolutionImpl::SelectFunc, + ConvolutionImpl::default_prio)) + LOG_ERROR()<<__FUNCTION__<<" :Regist OP failed for prio ["<GetShape(); const std::vector dims = shape.GetDim(); - int size = dims[2] * dims[3] * param_->kernel_size * param_->kernel_size * param_->num_output; + int size = dims[2] * dims[3] * param_->kernel_h * param_->kernel_w * param_->num_output; float* buffer = ( float* )std::malloc(sizeof(float) * size); memset(buffer, 0, size * sizeof(float)); 
(*node)["buffer"] = buffer; @@ -154,10 +154,10 @@ struct DeconvBlasOps : public NodeOps // param Deconvolution* deconv_op = dynamic_cast(node->GetOp()); DeconvParam* param_ = deconv_op->GetParam(); - int pad = param_->pad; - int stride = param_->stride; - int ksize = param_->kernel_size; - int dilation = param_->dilation; + int pad = param_->pad_w0; + int stride = param_->stride_w; + int ksize = param_->kernel_w; + int dilation = param_->dilation_w; // buffer float* buffer = any_cast(node->GetAttr("buffer")); @@ -205,15 +205,48 @@ struct DeconvBlasOps : public NodeOps } }; -} // namespace DeconvolutionImpl + +static bool isDeconvSupported(DeconvParam * param) +{ + if(param->pad_h0 != param->pad_h1 || param->pad_w0 != param->pad_w1 || + param->pad_w0 != param->pad_h0 || + param->stride_h != param->stride_w || + param->dilation_h != param->dilation_w || + param->group != 1 || + param->kernel_h != param->kernel_w + ) + return false; + return true; + +} +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ +#ifdef CONFIG_ATUH_DEVICE + if(!get_auth_float_enabled()) + return nullptr; +#endif + Operator* op = node->GetOp(); + Deconvolution* deconv_op = dynamic_cast(op); + DeconvParam* param = deconv_op->GetParam(); + if(!isDeconvSupported(param)) + return nullptr; + + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + DeconvBlasOps* ops = new DeconvBlasOps(); + return ops; +} -using namespace DeconvolutionImpl; +} // namespace DeconvolutionBlasImpl + +using namespace DeconvolutionBlasImpl; void RegisterDeconvBlasNodeExec(void) { - DeconvBlasOps* ops = new DeconvBlasOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Deconvolution", ops); + 
NodeOpsRegistryManager::RegisterOPImplementor("common","Deconvolution",DeconvolutionBlasImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/blas/fc_blas.cpp b/executor/operator/common/blas/fc_blas.cpp index 965a7f862..ae089f424 100644 --- a/executor/operator/common/blas/fc_blas.cpp +++ b/executor/operator/common/blas/fc_blas.cpp @@ -129,14 +129,25 @@ struct FcBlasOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + FcBlasOps* ops = new FcBlasOps(); + + return ops; +} + } // namespace FCImpl using namespace FCImpl; void RegisterFcBlasNodeExec(void) { - FcBlasOps* ops = new FcBlasOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "FullyConnected", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "FullyConnected", FCImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/blas/gru_blas.cpp b/executor/operator/common/blas/gru_blas.cpp new file mode 100644 index 000000000..a7a690c7d --- /dev/null +++ b/executor/operator/common/blas/gru_blas.cpp @@ -0,0 +1,473 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#include +#include +#include +#include +#include + +#include "graph.hpp" +#include "logger.hpp" +#include "node_ops.hpp" +#include "operator/gru.hpp" +#include "tensor_mem.hpp" +#include "tengine_errno.hpp" +#include +#include + +namespace TEngine { + +namespace GRURefImpl { + +struct GRUOps : public NodeOps +{ + Tensor* init_h_tensor; + Tensor* kernel_tensor; + Tensor* bias_tensor; + Tensor* candidate_kernel_tensor; + Tensor* candidate_bias_tensor; + Tensor* fused_kernel_tensor; + // bool dynamic_shape; + void* init_h_data; + + GRUOps(void) + { + init_h_tensor = nullptr; + bias_tensor = nullptr; + init_h_data = nullptr; + kernel_tensor=nullptr; + candidate_kernel_tensor=nullptr; + candidate_bias_tensor=nullptr; + fused_kernel_tensor=nullptr; + } + + void sigmoid(float* data, int size) + { + for(int i = 0; i < size; i++) + { + data[i] = std::min(data[i], 30.0f); + data[i] = std::max(data[i], -30.0f); + + data[i] = 1 / (1 + exp(-data[i])); + } + } + /* + @ func_name: concat_axis_1 + @ param: + a:[m, n1] + b:[m, n2] + c:[m, n1 + n2] + */ + void concat_axis_1(const float* a, const float* b, float* c, int m, int n1, int n2) + { + int n = n1 + n2; + for(int i = 0; i < m; i++) + { + for(int j = 0; j < n1; j++) + { + c[j + i * n] = a[j + i * n1]; + } + for(int j = 0; j < n2; j++) + { + c[j + i * n + n1] = b[j + i * n2]; + } + } + } + + void slice_axis_1(float* a, float* c, int m, int n, int st, int ed) + { + for(int i = 0; i < m; i++) + { + for(int j = st; j < ed; j++) + { 
+ c[i * (ed - st) + j - st] = a[i * n + j]; + } + } + } + void do_gemm(const float* a, const float* b, float* c, int m, int k, int n, int lda, int ldb, int ldc) + { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, a, lda, b, ldb, 0.0, c, ldc); + } + void do_gemm_mx(const float* a, const float* b, float* c, int m, int k, int n, int lda, int ldb, int ldc) + { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0, a, lda, b, ldb, 0.0, c, ldc); + } + + bool do_GRU_step(const float* input, float* init_h, const float* kernel, const float* bias, + const float* candidate_kernel,const float* candidate_bias,int batch_size, + int input_size, int hidden_size,int mxnet_flag) + { + + if(mxnet_flag==1) + { + float* i2h_mat = ( float* )malloc(sizeof(float) * batch_size *3* hidden_size); + float* h2h_mat = ( float* )malloc(sizeof(float) * batch_size *3* hidden_size); + + float* i2h_r = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + float* i2h_z = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + float* i2h = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + + float* h2h_r = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + float* h2h_z = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + float* h2h = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + + float* r_g = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + float* u_g = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + float* next_h_tmp = ( float* )malloc(batch_size*hidden_size * sizeof(float)); + + do_gemm_mx(input, kernel, i2h_mat, batch_size, input_size, 3*hidden_size, input_size, + input_size, 3*hidden_size); + + for(int i = 0; i < batch_size; i++) + { + for(int j = 0; j < (3*hidden_size); j++) + { + i2h_mat[i *(3*hidden_size) + j] += bias[j]; + } + } + + do_gemm_mx(init_h, candidate_kernel, h2h_mat, batch_size, hidden_size, 3*hidden_size, hidden_size, + hidden_size, 3*hidden_size); + + for(int i = 0; i < 
batch_size; i++) + { + for(int j = 0; j < (3*hidden_size); j++) + { + h2h_mat[i *(3*hidden_size) + j] += candidate_bias[j]; + } + } + slice_axis_1(i2h_mat, i2h_r, batch_size, 3 * hidden_size, 0, hidden_size); + slice_axis_1(i2h_mat, i2h_z, batch_size, 3 * hidden_size, hidden_size, 2*hidden_size); + slice_axis_1(i2h_mat, i2h, batch_size, 3 * hidden_size, 2*hidden_size, 3*hidden_size); + + slice_axis_1(h2h_mat, h2h_r, batch_size, 3 * hidden_size, 0, hidden_size); + slice_axis_1(h2h_mat, h2h_z, batch_size, 3 * hidden_size, hidden_size, 2*hidden_size); + slice_axis_1(h2h_mat, h2h, batch_size, 3 * hidden_size, 2*hidden_size, 3*hidden_size); + + for(int i = 0; i < batch_size*hidden_size; i++) + { + r_g[i] = i2h_r[i]+h2h_r[i]; + } + sigmoid(r_g,hidden_size * batch_size); + for(int i = 0; i < batch_size*hidden_size; i++) + { + u_g[i] = i2h_z[i]+h2h_z[i]; + } + sigmoid(u_g,hidden_size * batch_size); + + for(int i = 0; i < batch_size*hidden_size; i++) + { + next_h_tmp[i] = tanh(i2h[i]+r_g[i]*h2h[i]); + } + + for(int i = 0; i < batch_size*hidden_size; i++) + { + init_h[i] = u_g[i] * init_h[i] + (1-u_g[i]) * next_h_tmp[i]; + } + + // free memory + free(i2h_mat); + free(h2h_mat); + free(i2h_r); + free(i2h_z); + free(i2h); + free(h2h_r); + free(h2h_z); + free(h2h); + free(r_g); + free(u_g); + free(next_h_tmp); + + return true; + } + else + { + int input_total_size = input_size + hidden_size; + int batch_cell_size = hidden_size * batch_size; + + float* merged_input = ( float* )malloc(sizeof(float) * batch_size * (input_total_size)); + float* matmul_result = ( float* )malloc(sizeof(float) * batch_size *2* hidden_size ); + float* r = ( float* )malloc(batch_cell_size * sizeof(float)); + float* u = ( float* )malloc(batch_cell_size * sizeof(float)); + float* c = ( float* )malloc(batch_cell_size * sizeof(float)); + float* r_state = ( float* )malloc(batch_cell_size * sizeof(float)); + float* candidate = ( float* )malloc(sizeof(float) * batch_size* hidden_size); + + // merge input + 
concat_axis_1(input, init_h, merged_input, batch_size, input_size, hidden_size); + // do gemm + do_gemm(merged_input, kernel, matmul_result, batch_size, input_total_size, 2*hidden_size, input_total_size, + 2*hidden_size, 2*hidden_size); + // add bias + + + for(int i = 0; i < batch_size; i++) + { + for(int j = 0; j < (2*hidden_size); j++) + { + matmul_result[i *(2*hidden_size) + j] += bias[j]; + } + + } + + + sigmoid(matmul_result,2*hidden_size * batch_size); + slice_axis_1(matmul_result, r, batch_size, 2 * hidden_size, 0, hidden_size); + slice_axis_1(matmul_result, u, batch_size, 2 * hidden_size, hidden_size, 2*hidden_size); + + + for(int i = 0; i < batch_cell_size; i++) + r_state[i] = r[i] * init_h[i]; + + concat_axis_1(input, r_state, merged_input, batch_size, input_size, hidden_size); + //candidate kernerl + + + do_gemm(merged_input, candidate_kernel, candidate, batch_size, input_total_size, hidden_size, input_total_size, + hidden_size, hidden_size); + //candidate bias + + for(int i = 0; i < batch_size; i++) + { + for(int j = 0; j < hidden_size; j++) + { + candidate[i *hidden_size + j] += candidate_bias[j]; + } + } + + + for(int i = 0; i < batch_cell_size; i++) + { + c[i] = tanh(candidate[i]); + } + + for(int i = 0; i < batch_cell_size; i++) + { + init_h[i] = u[i] * init_h[i] + (1-u[i]) * c[i]; + } + // free memory + free(merged_input); + free(matmul_result); + free(candidate); + free(r); + free(u); + free(c); + return true; + } + + + } + + bool do_GRU(const float* input, float* output, float* init_h, const float* kernel, + const float* bias,const float* candidate_kernel,const float* candidate_bias, + int seq_lens, int batch_size, int input_size,int output_len, int hidden_size,int mxnet_flag) + { + for(int i = 0; i < seq_lens; i++) + { + + const float* seq_input = input + i * batch_size * input_size; + if(!do_GRU_step(seq_input, init_h, kernel, bias, candidate_kernel,candidate_bias,batch_size, input_size, hidden_size,mxnet_flag)) + { + return false; + } + + if(i 
+ output_len >= seq_lens) + { + memcpy(output, init_h, batch_size*hidden_size * sizeof(float)); + output += batch_size*hidden_size; + } + } + + return true; + } + + bool Prerun(Node* node) + { + GRU* gru_op = dynamic_cast(node->GetOp()); + + int in_num = node->GetInputNum(); + + for(int count = 0; count < in_num; count++) + { + Tensor* temptensor = node->GetInputTensor(count); + const std::string& name = temptensor->GetName(); + + if(name.find(gru_op->GetInitHiddenName()) != std::string::npos) + { + init_h_tensor = temptensor; + } + if(name.find(gru_op->GetBiasName()) != std::string::npos) + { + bias_tensor = temptensor; + } + if(name.find(gru_op->GetKernelName()) != std::string::npos) + { + kernel_tensor = temptensor; + } + if(name.find(gru_op->GetCandidateKernelName()) != std::string::npos) + { + candidate_kernel_tensor = temptensor; + } + if(name.find(gru_op->GetCandidateBiasName()) != std::string::npos) + { + candidate_bias_tensor = temptensor; + } + if(name.find(gru_op->Geti2hweightName()) != std::string::npos) + { + kernel_tensor = temptensor; + } + if(name.find(gru_op->Geti2hbiasName()) != std::string::npos) + { + bias_tensor = temptensor; + } + if(name.find(gru_op->Geth2hweightName()) != std::string::npos) + { + candidate_kernel_tensor = temptensor; + } + if(name.find(gru_op->Geth2hbiasName()) != std::string::npos) + { + candidate_bias_tensor = temptensor; + } + if(name.find(gru_op->GetFusedKernelName()) != std::string::npos) + { + fused_kernel_tensor = temptensor; + } + + + } + + if(init_h_tensor) + { + init_h_data = get_tensor_mem(init_h_tensor); + } + + return true; + } + + bool Run(Node* node) + { + GRU* gru_op = dynamic_cast(node->GetOp()); + GRUParam* param = gru_op->GetParam(); + + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + // Tensor* kernel_tensor = node->GetInputTensor(1); + + + + int input_size = 0; + int hidden_size = param->hidden_size; + + float* output = ( float* 
)get_tensor_mem(output_tensor); + // std::cout<<"ot::"<GetShape(); + + int seq_lens = input_shape.Shape(0); + int batch_size = input_shape.Shape(1); + int output_len = param->output_len; + int mxnet_flag = param->mxnet_flag; + + if(mxnet_flag==1) + { + input_size=input_shape.Shape(2); + // kernel_tensor = node->GetInputTensor(1); + } + else + { + input_size = param->input_size; + } + float* init_h = ( float* )malloc(batch_size * hidden_size * sizeof(float)); + + if(init_h == nullptr) + { + set_tengine_errno(ENOMEM); + return false; + } + + if(init_h_data) + { + for(int i = 0; i < batch_size; i++) + { + memcpy(init_h + i * hidden_size, init_h_data, hidden_size * sizeof(float)); + } + } + else + { + memset(init_h, 0x0, sizeof(batch_size * hidden_size * sizeof(float))); + } + + float* kernel = nullptr; + float* bias = nullptr; + float* fused_kernel=nullptr; + float* candidate_kernel = nullptr; + float* candidate_bias = nullptr; + + if(kernel_tensor) + kernel = ( float* )get_tensor_mem(kernel_tensor); + + if(bias_tensor) + bias = ( float* )get_tensor_mem(bias_tensor); + + if(candidate_kernel_tensor) + candidate_kernel = ( float* )get_tensor_mem(candidate_kernel_tensor); + + if(candidate_bias_tensor) + candidate_bias = ( float* )get_tensor_mem(candidate_bias_tensor); + + if(fused_kernel_tensor) + { + // std::cout<<"fused_kernel\n"; + fused_kernel=( float* )get_tensor_mem(fused_kernel_tensor); + kernel=fused_kernel; + candidate_kernel=fused_kernel+input_size*hidden_size*3; + bias=candidate_kernel+hidden_size*hidden_size*3; + candidate_bias=bias+hidden_size*3; + } + + bool ret = do_GRU(input, output, init_h, kernel, bias, candidate_kernel + ,candidate_bias,seq_lens, batch_size, input_size, output_len, hidden_size,mxnet_flag); + + free(init_h); + return ret; + } + + bool Postrun(Node* node) + { + return true; + } +}; + +} // namespace GRURefImpl + +using namespace GRURefImpl; +void RegisterGRUNodeExec(void) +{ + GRUOps* ops = new GRUOps(); + + 
NodeOpsRegistryManager::RegisterOPImplementor("common", "GRU", ops); +} + +} // namespace TEngine diff --git a/executor/operator/common/blas/lstm_blas.cpp b/executor/operator/common/blas/lstm_blas.cpp index 64e0d4c0b..04af0dad9 100644 --- a/executor/operator/common/blas/lstm_blas.cpp +++ b/executor/operator/common/blas/lstm_blas.cpp @@ -49,13 +49,18 @@ struct LSTMOps : public NodeOps Tensor* w_i_tensor; Tensor* w_o_tensor; Tensor* proj_tensor; + Tensor* kernel_tensor; + Tensor* h2h_kernel_tensor; + Tensor* h2h_bias_tensor; + Tensor* fused_kernel_tensor; void* init_h_data; void* init_c_data; - + // bool dynamic_shape; LSTMOps(void) { init_c_tensor = nullptr; init_h_tensor = nullptr; + kernel_tensor=nullptr; bias_tensor = nullptr; w_f_tensor = nullptr; w_i_tensor = nullptr; @@ -63,6 +68,9 @@ struct LSTMOps : public NodeOps proj_tensor = nullptr; init_h_data = nullptr; init_c_data = nullptr; + h2h_kernel_tensor=nullptr; + h2h_bias_tensor=nullptr; + fused_kernel_tensor=nullptr; } /* @@ -228,116 +236,191 @@ struct LSTMOps : public NodeOps { cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, a, lda, b, ldb, 0.0, c, ldc); } + void do_gemm_mx(const float* a, const float* b, float* c, int m, int k, int n, int lda, int ldb, int ldc) + { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0, a, lda, b, ldb, 0.0, c, ldc); + } bool do_LSTM_step(const float* input, float* init_h, float* init_c, const float* kernel, const float* bias, + const float* h2h_kernel, const float* h2h_bias, const float* w_f_data, const float* w_i_data, const float* w_o_data, const float* projection, - float forget_bias, int batch_size, int input_size, int hidden_size, int cell_size) + float forget_bias, int batch_size, int input_size, int hidden_size, int cell_size,int mxnet_flag) { - int input_total_size = input_size + hidden_size; - int batch_cell_size = cell_size * batch_size; + if(mxnet_flag==1) + { + int batch_cell_size = cell_size * batch_size; + float* i2h = ( 
float* )malloc(sizeof(float) * batch_size * cell_size * 4); + float* h2h = ( float* )malloc(sizeof(float) * batch_size * cell_size * 4); + float* gates = ( float* )malloc(sizeof(float) * batch_size * cell_size * 4); + + float* ig = ( float* )malloc(batch_cell_size * sizeof(float)); + float* cg = ( float* )malloc(batch_cell_size * sizeof(float)); + float* fg = ( float* )malloc(batch_cell_size * sizeof(float)); + float* og = ( float* )malloc(batch_cell_size * sizeof(float)); + // m k n + do_gemm_mx(input, kernel, i2h, batch_size, input_size, 4 * cell_size, input_size, + input_size, 4 * cell_size); + + if(bias) + { + for(int i = 0; i < batch_size; i++) + for(int j = 0; j < 4 * cell_size; j++) + i2h[i * 4 * cell_size + j] += bias[j]; + } - float* merged_input = ( float* )malloc(sizeof(float) * batch_size * (input_total_size)); - float* matmul_result = ( float* )malloc(sizeof(float) * batch_size * cell_size * 4); + do_gemm_mx(init_h, h2h_kernel, h2h, batch_size, hidden_size, 4 * hidden_size, hidden_size, + hidden_size, 4 * hidden_size); + if(h2h_bias) + { + for(int i = 0; i < batch_size; i++) + for(int j = 0; j < 4 * cell_size; j++) + h2h[i * 4 * cell_size + j] += h2h_bias[j]; + } + + for(int i = 0; i < batch_size*4*cell_size; i++) + gates[i] = i2h[i]+h2h[i]; + + slice_axis_1(gates, ig, batch_size, 4 * cell_size, 0, cell_size); + slice_axis_1(gates, fg, batch_size, 4 * cell_size, cell_size, 2 * cell_size); + slice_axis_1(gates, cg, batch_size, 4 * cell_size, 2 * cell_size, 3 * cell_size); + slice_axis_1(gates, og, batch_size, 4 * cell_size, 3 * cell_size, 4 * cell_size); + + for(int i = 0; i < batch_size*cell_size; i++) + fg[i]+=1; + + sigmoid(ig, batch_cell_size); + sigmoid(fg, batch_cell_size); + mytanh(cg, batch_cell_size); + sigmoid(og, batch_cell_size); - // merge input - concat_axis_1(input, init_h, merged_input, batch_size, input_size, hidden_size); + for(int i = 0; i < batch_cell_size; i++) + init_c[i] = init_c[i] * fg[i] + cg[i] * ig[i]; + + for(int i = 0; i < 
batch_cell_size; i++) + { + init_h[i] = tanh(init_c[i]) * og[i]; + } - // do gemm - do_gemm(merged_input, kernel, matmul_result, batch_size, input_total_size, 4 * cell_size, input_total_size, - 4 * cell_size, 4 * cell_size); + free(i2h); + free(h2h); + free(gates); + free(ig); + free(fg); + free(cg); + free(og); + return true; - // add bias - if(bias) - { - for(int i = 0; i < batch_size; i++) - for(int j = 0; j < 4 * cell_size; j++) - matmul_result[i * 4 * cell_size + j] += bias[j]; } + else + { + int input_total_size = input_size + hidden_size; + int batch_cell_size = cell_size * batch_size; - float* ig = ( float* )malloc(batch_cell_size * sizeof(float)); - float* cg = ( float* )malloc(batch_cell_size * sizeof(float)); - float* fg = ( float* )malloc(batch_cell_size * sizeof(float)); - float* og = ( float* )malloc(batch_cell_size * sizeof(float)); + float* merged_input = ( float* )malloc(sizeof(float) * batch_size * (input_total_size)); + float* matmul_result = ( float* )malloc(sizeof(float) * batch_size * cell_size * 4); - slice_axis_1(matmul_result, ig, batch_size, 4 * cell_size, 0, cell_size); - slice_axis_1(matmul_result, cg, batch_size, 4 * cell_size, cell_size, 2 * cell_size); - slice_axis_1(matmul_result, fg, batch_size, 4 * cell_size, 2 * cell_size, 3 * cell_size); - slice_axis_1(matmul_result, og, batch_size, 4 * cell_size, 3 * cell_size, 4 * cell_size); + // merge input + concat_axis_1(input, init_h, merged_input, batch_size, input_size, hidden_size); - // forget gate - for(int i = 0; i < batch_cell_size; i++) - fg[i] += forget_bias; + // do gemm + do_gemm(merged_input, kernel, matmul_result, batch_size, input_total_size, 4 * cell_size, input_total_size, + 4 * cell_size, 4 * cell_size); - // peephole - if(w_f_data) - { - for(int i = 0; i < batch_size; i++) - for(int j = 0; j < cell_size; j++) - { - fg[i * cell_size + j] += init_c[i * cell_size + j] * w_f_data[j]; - ig[i * cell_size + j] += init_c[i * cell_size + j] * w_i_data[j]; - } - } + // add bias + 
if(bias) + { + for(int i = 0; i < batch_size; i++) + for(int j = 0; j < 4 * cell_size; j++) + matmul_result[i * 4 * cell_size + j] += bias[j]; + } - sigmoid(fg, batch_cell_size); - sigmoid(ig, batch_cell_size); - mytanh(cg, batch_cell_size); + float* ig = ( float* )malloc(batch_cell_size * sizeof(float)); + float* cg = ( float* )malloc(batch_cell_size * sizeof(float)); + float* fg = ( float* )malloc(batch_cell_size * sizeof(float)); + float* og = ( float* )malloc(batch_cell_size * sizeof(float)); - // get cell output - for(int i = 0; i < batch_cell_size; i++) - init_c[i] = init_c[i] * fg[i] + cg[i] * ig[i]; + slice_axis_1(matmul_result, ig, batch_size, 4 * cell_size, 0, cell_size); + slice_axis_1(matmul_result, cg, batch_size, 4 * cell_size, cell_size, 2 * cell_size); + slice_axis_1(matmul_result, fg, batch_size, 4 * cell_size, 2 * cell_size, 3 * cell_size); + slice_axis_1(matmul_result, og, batch_size, 4 * cell_size, 3 * cell_size, 4 * cell_size); - if(w_o_data) - { - for(int i = 0; i < batch_size; i++) - for(int j = 0; j < cell_size; j++) - { - og[i * cell_size + j] += init_c[i * cell_size + j] * w_o_data[j]; - } - } + // forget gate + for(int i = 0; i < batch_cell_size; i++) + fg[i] += forget_bias; - sigmoid(og, batch_cell_size); + // peephole + if(w_f_data) + { + for(int i = 0; i < batch_size; i++) + for(int j = 0; j < cell_size; j++) + { + fg[i * cell_size + j] += init_c[i * cell_size + j] * w_f_data[j]; + ig[i * cell_size + j] += init_c[i * cell_size + j] * w_i_data[j]; + } + } - if(projection) - { + sigmoid(fg, batch_cell_size); + sigmoid(ig, batch_cell_size); + mytanh(cg, batch_cell_size); + + // get cell output for(int i = 0; i < batch_cell_size; i++) + init_c[i] = init_c[i] * fg[i] + cg[i] * ig[i]; + + if(w_o_data) { - og[i] = tanh(init_c[i]) * og[i]; + for(int i = 0; i < batch_size; i++) + for(int j = 0; j < cell_size; j++) + { + og[i * cell_size + j] += init_c[i * cell_size + j] * w_o_data[j]; + } } - /*batchxcell_size * cell_sizexhidden_size --> batch* 
hidden_size*/ - do_gemm(og, projection, init_h, batch_size, cell_size, hidden_size, cell_size, hidden_size, hidden_size); - } - else - { - for(int i = 0; i < batch_cell_size; i++) + sigmoid(og, batch_cell_size); + + if(projection) { - init_h[i] = tanh(init_c[i]) * og[i]; + for(int i = 0; i < batch_cell_size; i++) + { + og[i] = tanh(init_c[i]) * og[i]; + } + + /*batchxcell_size * cell_sizexhidden_size --> batch* hidden_size*/ + do_gemm(og, projection, init_h, batch_size, cell_size, hidden_size, cell_size, hidden_size, hidden_size); + } + else + { + for(int i = 0; i < batch_cell_size; i++) + { + init_h[i] = tanh(init_c[i]) * og[i]; + } } - } - // free memory - free(merged_input); - free(matmul_result); - free(ig); - free(cg); - free(fg); - free(og); + // free memory + free(merged_input); + free(matmul_result); + free(ig); + free(cg); + free(fg); + free(og); + return true; + } + - return true; + } bool do_LSTM(const float* input, float* output, float* init_h, float* init_c, const float* kernel, - const float* bias, const float* w_f_data, const float* w_i_data, const float* w_o_data, + const float* bias, const float* h2h_kernel, const float* h2h_bias, const float* w_f_data, const float* w_i_data, const float* w_o_data, const float* projection, float forget_bias, int seq_lens, int batch_size, int input_size, - int output_len, int hidden_size, int cell_size) + int output_len, int hidden_size, int cell_size,int mxnet_flag) { for(int i = 0; i < seq_lens; i++) { const float* seq_input = input + i * batch_size * input_size; - if(!do_LSTM_step(seq_input, init_h, init_c, kernel, bias, w_f_data, w_i_data, w_o_data, projection, - forget_bias, batch_size, input_size, hidden_size, cell_size)) + if(!do_LSTM_step(seq_input, init_h, init_c, kernel, bias,h2h_kernel,h2h_bias, w_f_data, w_i_data, w_o_data, projection, + forget_bias, batch_size, input_size, hidden_size, cell_size,mxnet_flag)) return false; if(i + output_len >= seq_lens) @@ -360,7 +443,10 @@ struct LSTMOps : public 
NodeOps { Tensor* temptensor = node->GetInputTensor(count); const std::string& name = temptensor->GetName(); - + if(name.find(lstm_op->GetKernelName()) != std::string::npos) + { + kernel_tensor = temptensor; + } if(name.find(lstm_op->GetInitCellName()) != std::string::npos) { init_c_tensor = temptensor; @@ -389,6 +475,26 @@ struct LSTMOps : public NodeOps { proj_tensor = temptensor; } + if(name.find(lstm_op->Geti2hKernelName()) != std::string::npos) + { + kernel_tensor = temptensor; + } + if(name.find(lstm_op->Geti2hBiasName()) != std::string::npos) + { + bias_tensor = temptensor; + } + if(name.find(lstm_op->Geth2hKernelName()) != std::string::npos) + { + h2h_kernel_tensor = temptensor; + } + if(name.find(lstm_op->Geth2hBiasName()) != std::string::npos) + { + h2h_bias_tensor = temptensor; + } + if(name.find(lstm_op->GetFusedKernelName()) != std::string::npos) + { + fused_kernel_tensor = temptensor; + } } if(init_c_tensor) @@ -411,17 +517,17 @@ struct LSTMOps : public NodeOps Tensor* input_tensor = node->GetInputTensor(0); Tensor* output_tensor = node->GetOutputTensor(0); - Tensor* kernel_tensor = node->GetInputTensor(1); + // Tensor* kernel_tensor = node->GetInputTensor(1); float forget_bias = param->forget_bias; bool has_peephole = param->has_peephole; bool has_projection = param->has_projection; - int input_size = param->input_size; + int hidden_size = param->hidden_size; int cell_size = param->cell_size; - + int input_size=0; float* output = ( float* )get_tensor_mem(output_tensor); float* input = ( float* )get_tensor_mem(input_tensor); @@ -430,7 +536,16 @@ struct LSTMOps : public NodeOps int seq_lens = input_shape.Shape(0); int batch_size = input_shape.Shape(1); int output_len = param->output_len; + int mxnet_flag= param->mxnet_flag; + if(mxnet_flag==1) + { + input_size=input_shape.Shape(2); + } + else + { + input_size = param->input_size; + } float* init_h = ( float* )malloc(batch_size * hidden_size * sizeof(float)); if(init_h == nullptr) @@ -462,16 +577,27 @@ 
struct LSTMOps : public NodeOps memset(init_c, 0x0, sizeof(batch_size * cell_size * sizeof(float))); } - float* kernel = ( float* )get_tensor_mem(kernel_tensor); - + float* kernel =nullptr; float* bias = nullptr; float* w_f_data = nullptr; float* w_i_data = nullptr; float* w_o_data = nullptr; float* projection = nullptr; + float* h2h_kernel =nullptr; + float* h2h_bias =nullptr; + float* fused_kernel =nullptr; + if(kernel_tensor) + kernel = ( float* )get_tensor_mem(kernel_tensor); + if(bias_tensor) bias = ( float* )get_tensor_mem(bias_tensor); + + if(h2h_kernel_tensor) + h2h_kernel = ( float* )get_tensor_mem(h2h_kernel_tensor); + + if(h2h_bias_tensor) + h2h_bias = ( float* )get_tensor_mem(h2h_bias_tensor); if(has_peephole) { @@ -479,12 +605,23 @@ struct LSTMOps : public NodeOps w_i_data = ( float* )get_tensor_mem(w_i_tensor); w_o_data = ( float* )get_tensor_mem(w_o_tensor); } + //int bsize=2*cell_size*4; + if(fused_kernel_tensor) + { + fused_kernel=( float* )get_tensor_mem(fused_kernel_tensor); + int kernel_size=get_tensor_mem_size(fused_kernel_tensor)/sizeof(float); + kernel=fused_kernel; + h2h_kernel=kernel+input_size*hidden_size*4; + bias=kernel+kernel_size-hidden_size*4*2; + h2h_bias=bias+hidden_size*4; + } if(has_projection) projection = ( float* )get_tensor_mem(proj_tensor); - bool ret = do_LSTM(input, output, init_h, init_c, kernel, bias, w_f_data, w_i_data, w_o_data, projection, - forget_bias, seq_lens, batch_size, input_size, output_len, hidden_size, cell_size); + // std::cout<<"inputmem: "< +#include +#include +#include +#include + +#include "graph.hpp" +#include "logger.hpp" +#include "node_ops.hpp" +#include "operator/rnn.hpp" +#include "tensor_mem.hpp" +#include "tengine_errno.hpp" +#include +#include + +namespace TEngine { + +namespace RNNRefImpl { + +struct RNNOps : public NodeOps +{ + Tensor* init_h_tensor; + Tensor* bias_tensor; + void* init_h_data; + + RNNOps(void) + { + init_h_tensor = nullptr; + bias_tensor = nullptr; + init_h_data = nullptr; + } 
+ + /* + @ func_name: concat_axis_1 + @ param: + a:[m, n1] + b:[m, n2] + c:[m, n1 + n2] + */ + void concat_axis_1(const float* a, const float* b, float* c, int m, int n1, int n2) + { + int n = n1 + n2; + for(int i = 0; i < m; i++) + { + for(int j = 0; j < n1; j++) + { + c[j + i * n] = a[j + i * n1]; + } + for(int j = 0; j < n2; j++) + { + c[j + i * n + n1] = b[j + i * n2]; + } + } + } + + void do_gemm(const float* a, const float* b, float* c, int m, int k, int n, int lda, int ldb, int ldc) + { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, a, lda, b, ldb, 0.0, c, ldc); + } + + bool do_RNN_step(const float* input, float* init_h, const float* kernel, const float* bias, + int batch_size, int input_size, int hidden_size) + { + int input_total_size = input_size + hidden_size; + int batch_cell_size = hidden_size * batch_size; + + float* ig = ( float* )malloc(batch_cell_size * sizeof(float)); + + float* merged_input = ( float* )malloc(sizeof(float) * batch_size * (input_total_size)); + float* matmul_result = ( float* )malloc(sizeof(float) * batch_size * hidden_size ); + + // merge input + concat_axis_1(input, init_h, merged_input, batch_size, input_size, hidden_size); + + // do gemm + do_gemm(merged_input, kernel, matmul_result, batch_size, input_total_size, hidden_size, input_total_size, + hidden_size, hidden_size); + + // add bias + if(bias) + { + for(int i = 0; i < batch_size; i++) + for(int j = 0; j < hidden_size; j++) + matmul_result[i *hidden_size + j] += bias[j]; + } + //activation + for(int i = 0; i < batch_cell_size; i++) + { + ig[i] = tanh(matmul_result[i]); + init_h[i]=ig[i]; + } + + // free memory + free(merged_input); + free(matmul_result); + free(ig); + + return true; + } + + bool do_RNN(const float* input, float* output, float* init_h, const float* kernel, + const float* bias, int seq_lens, int batch_size, int input_size,int output_len, int hidden_size) + { + for(int i = 0; i < seq_lens; i++) + { + const float* seq_input = input + i 
* batch_size * input_size; + + if(!do_RNN_step(seq_input, init_h, kernel, bias, batch_size, input_size, hidden_size)) + return false; + //outputs [batch_size,seq_len,hidden_size] + //final_state [batch_size,hidden_size] + if(i + output_len >= seq_lens) + { + memcpy(output, init_h, batch_size*hidden_size * sizeof(float)); + output += batch_size*hidden_size; + } + } + + return true; + } + + bool Prerun(Node* node) + { + RNN* rnn_op = dynamic_cast(node->GetOp()); + + int in_num = node->GetInputNum(); + + for(int count = 0; count < in_num; count++) + { + Tensor* temptensor = node->GetInputTensor(count); + const std::string& name = temptensor->GetName(); + + if(name.find(rnn_op->GetInitHiddenName()) != std::string::npos) + { + init_h_tensor = temptensor; + } + if(name.find(rnn_op->GetBiasName()) != std::string::npos) + { + bias_tensor = temptensor; + } + + } + + if(init_h_tensor) + { + init_h_data = get_tensor_mem(init_h_tensor); + } + + return true; + } + + bool Run(Node* node) + { + RNN* rnn_op = dynamic_cast(node->GetOp()); + RNNParam* param = rnn_op->GetParam(); + + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + Tensor* kernel_tensor = node->GetInputTensor(1); + + int input_size = param->input_size; + int hidden_size = param->hidden_size; + + float* output = ( float* )get_tensor_mem(output_tensor); + float* input = ( float* )get_tensor_mem(input_tensor); + + const TShape& input_shape = input_tensor->GetShape(); + + int seq_lens = input_shape.Shape(0); + int batch_size = input_shape.Shape(1); + int output_len = param->output_len; + + float* init_h = ( float* )malloc(batch_size * hidden_size * sizeof(float)); + + if(init_h == nullptr) + { + set_tengine_errno(ENOMEM); + return false; + } + + if(init_h_data) + { + for(int i = 0; i < batch_size; i++) + { + memcpy(init_h + i * hidden_size, init_h_data, hidden_size * sizeof(float)); + } + } + else + { + memset(init_h, 0x0, sizeof(batch_size * hidden_size * 
sizeof(float))); + } + + float* kernel = ( float* )get_tensor_mem(kernel_tensor); + + float* bias = nullptr; + + if(bias_tensor) + bias = ( float* )get_tensor_mem(bias_tensor); + + bool ret = do_RNN(input, output, init_h, kernel, bias, seq_lens, batch_size, input_size, output_len, hidden_size); + + free(init_h); + + return ret; + } + + bool Postrun(Node* node) + { + return true; + } +}; + +} // namespace RNNRefImpl + +using namespace RNNRefImpl; +void RegisterRNNNodeExec(void) +{ + RNNOps* ops = new RNNOps(); + + NodeOpsRegistryManager::RegisterOPImplementor("common", "RNN", ops); +} + +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/common/concat.cpp b/executor/operator/common/concat.cpp index cb8ee5a2b..1e6304325 100644 --- a/executor/operator/common/concat.cpp +++ b/executor/operator/common/concat.cpp @@ -48,8 +48,13 @@ struct ConcatOps : public NodeOps int element_size = DataType::GetTypeSize(input_tensor->GetDataType()); Tensor* output_tensor = node->GetOutputTensor(0); auto out_quant = output_tensor->GetQuantParam(); - int out_zero = (*out_quant)[0].zero_point; - float out_scale = (*out_quant)[0].scale; + int out_zero = 0; + float out_scale = 1; + if( !out_quant->empty() ) + { + out_zero = (*out_quant)[0].zero_point; + out_scale = (*out_quant)[0].scale; + } Concat* concat_op = dynamic_cast(node->GetOp()); ConcatParam* param = concat_op->GetParam(); @@ -107,15 +112,25 @@ struct ConcatOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + if(data_type != TENGINE_DT_FP32 && data_type != TENGINE_DT_UINT8) + return nullptr; + + ConcatOps* ops = new ConcatOps(); + + return ops; +} + } // namespace ConcatImpl using namespace ConcatImpl; void RegisterConcatNodeExec(void) { - ConcatOps* ops = new ConcatOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Concat", ops); + 
NodeOpsRegistryManager::RegisterOPImplementor("common", "Concat", ConcatImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/conv_ref.cpp b/executor/operator/common/conv_ref.cpp deleted file mode 100644 index 2d5fdfe4b..000000000 --- a/executor/operator/common/conv_ref.cpp +++ /dev/null @@ -1,498 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2017, Open AI Lab - * Author: haoluo@openailab.com - */ -#include -#include -#include -#include -#include - -#include "logger.hpp" -#include "node_ops.hpp" -#include "tensor_mem.hpp" -#include "data_type.hpp" - -#include "graph.hpp" -#include "operator/convolution.hpp" - -namespace TEngine { - -namespace conv_ref { - -struct op_data -{ - float i_scale; - int i_zero; - float k_scale; - int k_zero; - float o_scale; - int o_zero; - int activation_min; - int activation_max; -}; - -const char* conv_name = "CONV_REF"; -const int default_prio = 1500; -/* -template -void interleave_kernel(void* kernel_org , void* kernel_interleaved,int output_chan , - int kernel_h, int kernel_w,int kernel_c) -{ - data_type* kernel = (data_type*) kernel_org; - data_type* kernel_inter = (data_type*) kernel_interleaved; - - int kernel_size = kernel_h * kernel_w * kernel_c; - for(int i =0;i(std::round(f / scale)); }; - - if(activation_type == 0) - { - op_param.activation_max = 255; - op_param.activation_min = std::max(0, quantize(0)); - } - else if(activation_type == 6) - { - op_param.activation_max = std::min(255, quantize(6)); - op_param.activation_min = std::max(0, quantize(0)); - } - else if(activation_type == 1) - { - op_param.activation_max = std::min(255, quantize(1)); - op_param.activation_min = std::max(0, quantize(-1)); - } - else - { - op_param.activation_max = 255; - op_param.activation_min = 0; - } - - return true; -} -/* -bool GetQuantizedMultiplerShift(op_data& op_param) -{ - const double input_product_scale = op_param.i_scale*op_param.k_scale; - double double_multiplier = input_product_scale/op_param.o_scale; - int shift = 0; - if(double_multiplier<1) - { - while(double_multiplier < 0.5) - { - double_multiplier*=2; - shift ++; - } - } - else if(double_multiplier>=1) - { - while(double_multiplier>1) - { - double_multiplier/=2; - shift --; - } - } - op_param.multiplier = std::round(double_multiplier * 256); - op_param.shift = -shift; - //printf("%f, 
%f, %f, %f,%d, %d\n",op_param.i_scale,op_param.k_scale,op_param.o_scale, - // dd, op_param.multiplier, shift); - //printf("%d, %d\n",op_param.i_zero,op_param.k_zero); - - return true; -} -*/ -template -void im2col(void* input_org, void* im2col, int input_chan, int input_x, int input_y, int kernel_x, int kernel_y, - int stride_x, int stride_y, int pad_x0, int pad_y0, int pad_x1, int pad_y1, int output_x, int output_y, - int group, int i_zero) -{ - data_type* input = ( data_type* )input_org; - data_type* col = ( data_type* )im2col; - - int input_c = input_chan * group; - int kernel_size = input_chan * kernel_x * kernel_y; - for(int h = 0; h < output_y; h++) - { - data_type* col_h = col + output_x * kernel_size * h; - for(int w = 0; w < output_x; w++) - { - data_type* col_w = col_h + kernel_size * w; - int w_start = w * stride_x - pad_x0; - int w_end = w_start + kernel_x; - int h_start = h * stride_y - pad_y0; - int h_end = h_start + kernel_y; - - for(int kh = h_start; kh < h_end; kh++) - for(int kw = w_start; kw < w_end; kw++) - for(int kc = 0; kc < input_chan; kc++) - { - if(kh < 0 || kh >= input_y || kw < 0 || kw >= input_x) - { - *col_w++ = ( data_type )i_zero; - } - else - *col_w++ = input[kh * input_c * input_x + kw * input_c + kc]; - } - } - } -} - -template -static void run_kernel(void* input, void* output, void* kernel, void* bias, int activation, int kernel_h, int kernel_w, - int input_c, int output_chan, int output_x, int output_y, int group, op_data param) -{ - data_type* output0 = ( data_type* )output; - data_type* kernel0 = ( data_type* )kernel; - - int in_chan_rel = input_c * group; - int out_chan_real = output_chan * group; - int kernel_size = input_c * kernel_h * kernel_w; - - for(int c = 0; c < output_chan; c++) - { - data_type* kernel_cur = kernel0 + c * in_chan_rel * kernel_h * kernel_w; - if(sizeof(data_type) == 4) - { - float* bias0 = ( float* )bias; - float bias_cur = bias0 ? 
bias0[c] : 0; - for(int h = 0; h < output_y; h++) - for(int w = 0; w < output_x; w++) - { - int index = h * output_x * out_chan_real + w * out_chan_real + c; - float tmp = bias_cur; - float* input_cur = ( float* )input + kernel_size * h * output_x + w * kernel_size; - for(int i = 0; i < kernel_h; i++) - for(int j = 0; j < kernel_w; j++) - for(int k = 0; k < input_c; k++) - { - int pos = i * kernel_w * in_chan_rel + j * in_chan_rel + k; - tmp += *input_cur * kernel_cur[pos]; - input_cur++; - } - - if(activation == 0) - { - if(tmp < 0) - tmp = 0; - } - if(activation == 6) - { - if(tmp < 0) - tmp = 0; - if(tmp > 6) - tmp = 6; - } - output0[index] = tmp; - } - } - else - { - int* bias0 = ( int* )bias; - int bias_cur = bias0 ? bias0[c] : 0; - for(int h = 0; h < output_y; h++) - for(int w = 0; w < output_x; w++) - { - int index = h * output_x * out_chan_real + w * out_chan_real + c; - int tmp = bias_cur; - uint8_t* input_cur = ( uint8_t* )input + kernel_size * h * output_x + w * kernel_size; - for(int i = 0; i < kernel_h; i++) - for(int j = 0; j < kernel_w; j++) - for(int k = 0; k < input_c; k++) - { - int pos = i * kernel_w * in_chan_rel + j * in_chan_rel + k; - tmp += (*input_cur - param.i_zero) * (kernel_cur[pos] - param.k_zero); - input_cur++; - } - tmp = std::round(tmp * param.i_scale * param.k_scale / param.o_scale); - - tmp += param.o_zero; - tmp = std::max(param.activation_min, tmp); - tmp = std::min(param.activation_max, tmp); - output0[index] = tmp; - } - } - } -} - -struct ConvRef : public MTNodeOps -{ - bool Prerun(Node* node) override; - bool Run(Node* node) override; - bool Reshape(Node* node) override; - bool Postrun(Node* node) override; - bool GetSharedMemorySize(Node*, unsigned int& mem_size) override; - bool SetSharedMemoryAddr(Node*, void* mem_addr, int mem_size) override; - - bool RunNHWC(Node* node); - bool RunNCHW(Node* node); - - op_data op_param; - int element_size; - bool dynamic_shape; -}; - -bool ConvRef::Reshape(Node* node) -{ - unsigned int 
new_col_size; - - GetSharedMemorySize(node, new_col_size); - - if(node->ExistAttr("col_buf_allocated")) - { - unsigned int col_size = any_cast(node->GetAttr("col_buf_allocated")); - - if(new_col_size == col_size) - return true; - - float* addr = any_cast(node->GetAttr("col_buf")); - mem_free(addr); - } - - float* col_buf = ( float* )mem_alloc(new_col_size); - node->SetAttr("col_buf", col_buf); - node->SetAttr("col_buf_allocated", new_col_size); - return true; -} - -bool ConvRef::SetSharedMemoryAddr(Node* node, void* mem_addr, int mem_size) -{ - (*node)["shared_col_buf"] = mem_addr; - return true; -} - -bool ConvRef::GetSharedMemorySize(Node* node, unsigned int& mem_size) -{ - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - int group = param->group; - - Tensor* output_tensor = node->GetOutputTensor(0); - TShape& output_shape = output_tensor->GetShape(); - int output_y = output_shape.GetH(); - int output_x = output_shape.GetW(); - - Tensor* input_tensor = node->GetInputTensor(0); - TShape& input_shape = input_tensor->GetShape(); - element_size = DataType::GetTypeSize(input_tensor->GetDataType()); - - int input_chan = input_shape.GetC(); - int kernel_size = input_chan / group * param->kernel_h * param->kernel_w; - int output_xy = output_x * output_y; - - mem_size = (element_size * kernel_size * output_xy); - - return true; -} - -bool ConvRef::Prerun(Node* node) -{ - if(!dynamic_shape) - { - if(node->ExistAttr("shared_col_buf")) - { - void* addr = any_cast(node->GetAttr("shared_col_buf")); - - (*node)["col_buf"] = addr; - } - else - { - unsigned int col_size; - - GetSharedMemorySize(node, col_size); - - void* col_buf = mem_alloc(col_size); - (*node)["col_buf"] = col_buf; - node->SetAttr("col_buf_allocated", col_size); - } - } - if(element_size == 1) - { - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - Tensor* input_tensor = node->GetInputTensor(0); - Tensor* kernel_tensor = 
node->GetInputTensor(1); - Tensor* output_tensor = node->GetOutputTensor(0); - - auto* in_quant = input_tensor->GetQuantParam(); - op_param.i_scale = (*in_quant)[0].scale; - op_param.i_zero = (*in_quant)[0].zero_point; - auto* k_quant = kernel_tensor->GetQuantParam(); - op_param.k_scale = (*k_quant)[0].scale; - op_param.k_zero = (*k_quant)[0].zero_point; - auto* o_quant = output_tensor->GetQuantParam(); - op_param.o_scale = (*o_quant)[0].scale; - op_param.o_zero = (*o_quant)[0].zero_point; - // GetQuantizedMultiplerShift(op_param); - GetQuantizedActivationMinMax(op_param, param->activation); - } - - return true; -} - -bool ConvRef::Run(Node* node) -{ - if(exec_attr->layout == TENGINE_LAYOUT_NHWC) - { - return RunNHWC(node); - } - else - { - // TODO: support NCHW - return false; - } -} - -bool ConvRef::RunNHWC(Node* node) -{ - Convolution* conv_op = dynamic_cast(node->GetOp()); - ConvParam* param = conv_op->GetParam(); - - int kernel_h = param->kernel_h; - int kernel_w = param->kernel_w; - int stride_h = param->stride_h; - int stride_w = param->stride_w; - // int pad_h = param->pad_h; - // int pad_w = param->pad_w; - int dilation_h = param->dilation_h; - int dilation_w = param->dilation_w; - int pad_x0 = param->pads[1]; // left padding columns - int pad_x1 = param->pads[3]; // right padding columns - int pad_y0 = param->pads[0]; // top padding rows - int pad_y1 = param->pads[2]; // bottom padding rows - int group = param->group; - int activation = param->activation; - if(dilation_h != 1 || dilation_w != 1) - return false; - - Tensor* input_tensor = node->GetInputTensor(0); - uint8_t* input_org = ( uint8_t* )get_tensor_mem(input_tensor); - TShape& input_shape = input_tensor->GetShape(); - int input_w = input_shape.GetW(); - int input_h = input_shape.GetH(); - int input_c = input_shape.GetC() / group; - int input_n = input_shape.GetN(); - int input_size = input_w * input_h * input_c; - - Tensor* kernel_tensor = node->GetInputTensor(1); - uint8_t* kernel = ( uint8_t* 
)get_tensor_mem(kernel_tensor); - - uint8_t* bias_data = nullptr; - if(node->GetInputNum() > 2) - { - Tensor* bias_tensor = node->GetInputTensor(2); - bias_data = ( uint8_t* )get_tensor_mem(bias_tensor); - } - - Tensor* output_tensor = node->GetOutputTensor(0); - uint8_t* output_org = ( uint8_t* )get_tensor_mem(output_tensor); - - TShape& output_shape = output_tensor->GetShape(); - int output_w = output_shape.GetW(); - int output_h = output_shape.GetH(); - int output_c = output_shape.GetC() / group; - int output_xy = output_h * output_w; - - void* col_buf = any_cast(node->GetAttr("col_buf")); - uint8_t* col = ( uint8_t* )col_buf; - - for(int n = 0; n < input_n; n++) - { - uint8_t* input = input_org + n * input_size * group * element_size; - uint8_t* output = output_org + n * output_xy * output_c * group * element_size; - - for(int g = 0; g < group; g++) - { - uint8_t* input_g = input + input_c * g * element_size; - uint8_t* output_g = output + output_c * g * element_size; - uint8_t* kernel_g = kernel + input_c * g * element_size; - uint8_t* bias_g = bias_data ? 
bias_data + output_c * g * 4 : nullptr; - if(element_size == 4) - { - im2col(input_g, col, input_c, input_w, input_h, kernel_w, kernel_h, stride_w, stride_h, pad_x0, - pad_y0, pad_x1, pad_y1, output_w, output_h, group, 0); - run_kernel(col, output_g, kernel_g, bias_g, activation, kernel_h, kernel_w, input_c, output_c, - output_w, output_h, group, op_param); - } - - if(element_size == 1) - { - im2col(input_g, col, input_c, input_w, input_h, kernel_w, kernel_h, stride_w, stride_h, pad_x0, - pad_y0, pad_x1, pad_y1, output_w, output_h, group, op_param.i_zero); - run_kernel(col, output_g, kernel_g, bias_g, activation, kernel_h, kernel_w, input_c, output_c, - output_w, output_h, group, op_param); - } - } - } - - return true; -} - -bool ConvRef::Postrun(Node* node) -{ - if(node->ExistAttr("col_buf_allocated")) - { - void* addr = any_cast(node->GetAttr("col_buf")); - mem_free(addr); - node->RemoveAttr("col_buf_allocated"); - } - return true; -} - -NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) -{ - ConvRef* ops = new ConvRef(); - - ops->need_free = true; - if(node->IsDynamicShape()) - ops->dynamic_shape = true; - else - ops->dynamic_shape = false; - - return ops; -} - -} // namespace conv_ref - -void RegisterConv2dRef(void) -{ - NodeOpsRegistryManager::RegisterOPImplementor("common", "Convolution", conv_ref::SelectFunc, - conv_ref::default_prio); -} - -} // namespace TEngine diff --git a/executor/operator/common/detection_output.cpp b/executor/operator/common/detection_output.cpp index 52c591697..c47e7364c 100644 --- a/executor/operator/common/detection_output.cpp +++ b/executor/operator/common/detection_output.cpp @@ -240,15 +240,26 @@ struct DetectionOutputOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != 
TENGINE_LAYOUT_NCHW) + return nullptr; + + DetectionOutputOps* ops = new DetectionOutputOps(); + + return ops; +} + } // namespace DetectionOutputImpl using namespace DetectionOutputImpl; void RegisterDetectionOutputNodeExec(void) { - DetectionOutputOps* ops = new DetectionOutputOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "DetectionOutput", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "DetectionOutput", DetectionOutputImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/detection_postprocess.cpp b/executor/operator/common/detection_postprocess.cpp index 1f7ebb1ad..44ee81204 100644 --- a/executor/operator/common/detection_postprocess.cpp +++ b/executor/operator/common/detection_postprocess.cpp @@ -325,15 +325,27 @@ struct DetectionPostProcessOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if((data_type != TENGINE_DT_FP32&&data_type != TENGINE_DT_UINT8) || + exec_attr->graph_layout != TENGINE_LAYOUT_NHWC) + return nullptr; + + DetectionPostProcessOps* ops = new DetectionPostProcessOps(); + + return ops; +} + } // namespace DetectionPostProcessImpl using namespace DetectionPostProcessImpl; void RegisterDetectionPostProcessNodeExec(void) { - DetectionPostProcessOps* ops = new DetectionPostProcessOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "DetectionPostProcess", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "DetectionPostProcess", DetectionPostProcessImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/dropout.cpp b/executor/operator/common/dropout.cpp index e595ecd51..bfcdeb918 100644 --- a/executor/operator/common/dropout.cpp +++ b/executor/operator/common/dropout.cpp @@ -51,19 +51,39 @@ struct DropoutOps : public 
NodeOps bool Run(Node* node) { // Nothing needs to do for inference - return true; + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + void* input_org = get_tensor_mem(input_tensor); + void* output_org = get_tensor_mem(output_tensor); + if(input_org == output_org) + return true; + + int size = input_tensor->GetTotalSize(); + memcpy(output_org, input_org, size); + return true; } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + DropoutOps* ops = new DropoutOps(); + + return ops; +} + } // namespace DropImpl using namespace DropImpl; void RegisterDropoutNodeExec(void) { - DropoutOps* ops = new DropoutOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Dropout", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Dropout", DropImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/eltwise.cpp b/executor/operator/common/eltwise.cpp index bbfab37f7..dcde9f9ca 100644 --- a/executor/operator/common/eltwise.cpp +++ b/executor/operator/common/eltwise.cpp @@ -204,15 +204,26 @@ struct EltwiseOps : public NodeOps }; // struct EltwiseOps +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + EltwiseOps* ops = new EltwiseOps(); + + return ops; +} + } // namespace EltwiseImpl using namespace EltwiseImpl; void RegisterEltwiseNodeExec(void) { - EltwiseOps* ops = new EltwiseOps(); - - 
NodeOpsRegistryManager::RegisterOPImplementor("common", "Eltwise", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Eltwise", EltwiseImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/flatten.cpp b/executor/operator/common/flatten.cpp index 1a7fd0fbc..145e8ca9f 100644 --- a/executor/operator/common/flatten.cpp +++ b/executor/operator/common/flatten.cpp @@ -73,15 +73,26 @@ struct FlattenOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + FlattenOps* ops = new FlattenOps(); + + return ops; +} + } // namespace FlattenImpl using namespace FlattenImpl; void RegisterFlattenNodeExec(void) { - FlattenOps* ops = new FlattenOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Flatten", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Flatten", FlattenImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/fused/fused_bn_scale_relu.cpp b/executor/operator/common/fused/fused_bn_scale_relu.cpp index 1d2362bff..b360144de 100644 --- a/executor/operator/common/fused/fused_bn_scale_relu.cpp +++ b/executor/operator/common/fused/fused_bn_scale_relu.cpp @@ -187,15 +187,26 @@ struct FusedOps : public MTNodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + FusedOps* ops = new FusedOps(); + + return ops; +} + } // namespace FusedBNScaleReluImpl using namespace FusedBNScaleReluImpl; void 
RegisterCommonFusedBNScaleReluNodeExec(void) { - FusedOps* ops = new FusedOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", FusedBNScaleReLu::class_name, ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", FusedBNScaleReLu::class_name, FusedBNScaleReluImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/init.cpp b/executor/operator/common/init.cpp index 1b1f595ef..3cf766d3c 100644 --- a/executor/operator/common/init.cpp +++ b/executor/operator/common/init.cpp @@ -26,7 +26,6 @@ namespace TEngine { -extern void NodeOpsRegistryManagerInit(void); extern void RegisterConcatNodeExec(void); extern void RegisterDropoutNodeExec(void); extern void RegisterSoftmaxNodeExec(void); @@ -49,27 +48,23 @@ extern void RegisterReLuNodeExec(void); extern void RegisterResizeNodeExec(void); extern void RegisterLogisticNodeExec(void); extern void RegisterDetectionPostProcessNodeExec(void); -extern void RegisterConv2dRef(void); #ifdef CONFIG_ARCH_BLAS extern void RegisterConvBlasNodeExec(void); extern void RegisterDeconvBlasNodeExec(void); extern void RegisterFcBlasNodeExec(void); extern void RegisterLSTMNodeExec(void); +extern void RegisterRNNNodeExec(void); +extern void RegisterGRUNodeExec(void); #endif extern void RegisterPooling_NodeExec(void); extern void RegisterBatchNorm_NodeExec(void); extern void RegisterScale_NodeExec(void); extern void RegisterCommonFusedBNScaleReluNodeExec(void); -extern void RegisterDemoOps(void); void RegisterCommonOps(void) { -#ifndef ANDROID - RegisterDemoOps(); -#endif - RegisterConcatNodeExec(); RegisterDropoutNodeExec(); RegisterSoftmaxNodeExec(); @@ -92,13 +87,14 @@ void RegisterCommonOps(void) RegisterResizeNodeExec(); RegisterLogisticNodeExec(); RegisterDetectionPostProcessNodeExec(); - RegisterConv2dRef(); #ifdef CONFIG_ARCH_BLAS RegisterConvBlasNodeExec(); RegisterDeconvBlasNodeExec(); RegisterFcBlasNodeExec(); RegisterLSTMNodeExec(); + RegisterRNNNodeExec(); + RegisterGRUNodeExec(); 
#endif RegisterPooling_NodeExec(); RegisterBatchNorm_NodeExec(); diff --git a/executor/operator/common/logistic.cpp b/executor/operator/common/logistic.cpp index 3cd108d90..92ad3ea10 100644 --- a/executor/operator/common/logistic.cpp +++ b/executor/operator/common/logistic.cpp @@ -74,9 +74,9 @@ struct LogisticOps : public NodeOps auto o_quantized = output->GetQuantParam(); float i_scale = (*i_quantized)[0].scale; - float i_zero = (*i_quantized)[0].zero_point; + int i_zero = (*i_quantized)[0].zero_point; float o_scale = (*o_quantized)[0].scale; - float o_zero = (*o_quantized)[0].zero_point; + int o_zero = (*o_quantized)[0].zero_point; for(int i = 0; i < elements; i++) { @@ -90,15 +90,27 @@ struct LogisticOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if( (data_type != TENGINE_DT_FP32 && data_type != TENGINE_DT_UINT8) + || exec_attr->graph_layout != TENGINE_LAYOUT_NHWC) + return nullptr; + + LogisticOps* ops = new LogisticOps(); + + return ops; +} + } // namespace LogisticImpl using namespace LogisticImpl; void RegisterLogisticNodeExec(void) { - LogisticOps* ops = new LogisticOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Logistic", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Logistic", LogisticImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/lrn.cpp b/executor/operator/common/lrn.cpp index eb9a2f31b..151619a5f 100644 --- a/executor/operator/common/lrn.cpp +++ b/executor/operator/common/lrn.cpp @@ -122,15 +122,26 @@ struct LRNOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != 
TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + LRNOps* ops = new LRNOps(); + + return ops; +} + } // namespace LRNImpl using namespace LRNImpl; void RegisterLRN_NodeExec(void) { - LRNOps* ops = new LRNOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "LRN", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "LRN", LRNImpl::SelectFunc, 1000); } } // namespace TEngine \ No newline at end of file diff --git a/executor/operator/common/lrn_arm.cpp b/executor/operator/common/lrn_arm.cpp index 2d0fcf760..cea518834 100644 --- a/executor/operator/common/lrn_arm.cpp +++ b/executor/operator/common/lrn_arm.cpp @@ -269,6 +269,19 @@ struct LRNOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + LRNOps* ops = new LRNOps(); + + return ops; +} + } // namespace LRNImplArm using namespace LRNImplArm; @@ -276,13 +289,11 @@ using namespace LRNImplArm; void RegisterLRNNodeExec(void) { #ifdef CONFIG_ARCH_ARM32 - LRNOps* arm32_ops = new LRNOps(); - NodeOpsRegistryManager::RegisterOPImplementor("arm32", "LRN", arm32_ops); + NodeOpsRegistryManager::RegisterOPImplementor("arm32", "LRN", LRNImplArm::SelectFunc, 1000); #endif #ifdef CONFIG_ARCH_ARM64 - LRNOps* arm64_ops = new LRNOps(); - NodeOpsRegistryManager::RegisterOPImplementor("arm64", "LRN", arm64_ops); + NodeOpsRegistryManager::RegisterOPImplementor("arm64", "LRN", LRNImplArm::SelectFunc, 1000); #endif } diff --git a/executor/operator/common/normalize.cpp b/executor/operator/common/normalize.cpp index 2a70291dc..e387b3233 100644 --- a/executor/operator/common/normalize.cpp +++ b/executor/operator/common/normalize.cpp @@ -118,15 +118,26 @@ struct NormalizeOps : public 
NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + NormalizeOps* ops = new NormalizeOps(); + + return ops; +} + } // namespace NormalizeImpl using namespace NormalizeImpl; void RegisterNormalizeNodeExec(void) { - NormalizeOps* ops = new NormalizeOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Normalize", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Normalize", NormalizeImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/permute.cpp b/executor/operator/common/permute.cpp index b05623361..b683e7684 100644 --- a/executor/operator/common/permute.cpp +++ b/executor/operator/common/permute.cpp @@ -56,7 +56,7 @@ struct PermuteOps : public NodeOps } } - bool Run(Node* node) + bool Run(Node* node) { const Tensor* input_tensor = node->GetInputTensor(0); Tensor* output_tensor = node->GetOutputTensor(0); @@ -66,41 +66,83 @@ struct PermuteOps : public NodeOps const TShape& shape = input_tensor->GetShape(); const std::vector dims = shape.GetDim(); - - int batch_number = dims[0]; - int channel = dims[1]; - int width = dims[3]; - int height = dims[2]; - int _wc = width * channel; - int _hw = width * height; - int _chw = channel * _hw; - - float* input = ( float* )get_tensor_mem(input_tensor); - float* output = ( float* )get_tensor_mem(output_tensor); - // 0231 [bhwc] - if((param->order0 == 0) && (param->order1 == 2) && (param->order2 == 3) && (param->order3 == 1)) + if(dims.size()==4){ + int batch_number = dims[0]; + int channel = dims[1]; + int width = dims[3]; + int height = dims[2]; + int _wc = width * channel; + int _hw = width * height; + int _chw = channel * _hw; + + float* input = ( float* 
)get_tensor_mem(input_tensor); + float* output = ( float* )get_tensor_mem(output_tensor); + // 0231 [bhwc] + // other case to be support + if((param->order0 == 0) && (param->order1 == 2) && (param->order2 == 3) && (param->order3 == 1)) + { + for(int b = 0; b < batch_number; b++) + { + permute_hwc(input, output, height, width, channel, _wc, _hw); + input += _chw; + output += _chw; + } + } + } + else if(dims.size()==3) { - for(int b = 0; b < batch_number; b++) + int channel = dims[0]; + int width = dims[2]; + int height = dims[1]; + int _hw = height * width; + int _cw = channel * width; + + float* input = ( float* )get_tensor_mem(input_tensor); + float* output = ( float* )get_tensor_mem(output_tensor); + if((param->order0 == 1) && (param->order1 == 0) && (param->order2 == 2)) { - permute_hwc(input, output, height, width, channel, _wc, _hw); - input += _chw; - output += _chw; + for (int q=0; qGetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + PermuteOps* ops = new PermuteOps(); + + return ops; +} + } // namespace PermuteImpl using namespace PermuteImpl; void RegisterPermuteNodeExec(void) { - PermuteOps* ops = new PermuteOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Permute", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Permute", PermuteImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/pooling.cpp b/executor/operator/common/pooling.cpp index 9bd735440..cd8a69a51 100644 --- a/executor/operator/common/pooling.cpp +++ b/executor/operator/common/pooling.cpp @@ -245,7 +245,7 @@ struct PoolOps : public NodeOps uint8_t* input_data = ( uint8_t* )get_tensor_mem(itensor); uint8_t* output_data = ( uint8_t* )get_tensor_mem(otensor); - if(exec_attr->layout == TENGINE_LAYOUT_NCHW) + if(exec_attr->graph_layout == 
TENGINE_LAYOUT_NCHW) { if(param_->alg == kPoolMax) { @@ -260,9 +260,9 @@ struct PoolOps : public NodeOps for(int n = 0; n < input_n; n++) { Generic_MaxPool(( float* )input_data + n * in_chw, ( float* )output_data + n * out_chw, input_c, - input_h, input_w, output_h, output_w, param_->kernel_shape[0], - param_->kernel_shape[1], param_->strides[0], param_->strides[1], - param_->pads[0], param_->pads[1]); + input_h, input_w, output_h, output_w, param_->kernel_h, + param_->kernel_w, param_->stride_h, param_->stride_w, + param_->pad_h0, param_->pad_w0); } } } @@ -279,9 +279,9 @@ struct PoolOps : public NodeOps for(int n = 0; n < input_n; n++) { Generic_AvgPool(( float* )input_data + n * in_chw, ( float* )output_data + n * out_chw, input_c, - input_h, input_w, output_h, output_w, param_->kernel_shape[0], - param_->kernel_shape[1], param_->strides[0], param_->strides[1], - param_->pads[0], param_->pads[1], param_->caffe_flavor); + input_h, input_w, output_h, output_w, param_->kernel_h, + param_->kernel_w, param_->stride_h, param_->stride_w, + param_->pad_h0, param_->pad_w0, param_->caffe_flavor); } } } @@ -314,13 +314,13 @@ struct PoolOps : public NodeOps if(elem_size == 4) Generic_AvgPool_nhwc( input_data + n * in_chw * 4, output_data + n * out_chw * 4, input_c, input_h, input_w, - output_h, output_w, param_->kernel_shape[0], param_->kernel_shape[1], - param_->strides[0], param_->strides[1], param_->pads[0], param_->pads[1]); + output_h, output_w, param_->kernel_h, param_->kernel_w, + param_->stride_h, param_->stride_w, param_->pad_h0, param_->pad_w0); if(elem_size == 1) Generic_AvgPool_nhwc( input_data + n * in_chw, output_data + n * out_chw * 1, input_c, input_h, input_w, - output_h, output_w, param_->kernel_shape[0], param_->kernel_shape[1], - param_->strides[0], param_->strides[1], param_->pads[0], param_->pads[1]); + output_h, output_w, param_->kernel_h, param_->kernel_w, + param_->stride_h, param_->stride_w, param_->pad_h0, param_->pad_w0); } } } @@ -335,15 +335,26 
@@ struct PoolOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + PoolOps* ops = new PoolOps(); + + return ops; +} + } // namespace PoolingRef using namespace PoolingRef; void RegisterPooling_NodeExec(void) { - PoolOps* ops = new PoolOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Pooling", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Pooling", PoolingRef::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/prelu.cpp b/executor/operator/common/prelu.cpp index c7ea2e140..328f2633a 100644 --- a/executor/operator/common/prelu.cpp +++ b/executor/operator/common/prelu.cpp @@ -80,15 +80,26 @@ struct PreluOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + PreluOps* ops = new PreluOps(); + + return ops; +} + } // namespace PreluImpl using namespace PreluImpl; void RegisterPReLUNodeExec(void) { - PreluOps* ops = new PreluOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "PReLU", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "PReLU", PreluImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/priorbox.cpp b/executor/operator/common/priorbox.cpp index 3d0ec9676..52ace91e2 100644 --- a/executor/operator/common/priorbox.cpp +++ b/executor/operator/common/priorbox.cpp @@ -165,15 +165,26 @@ struct PriorBoxOps : public NodeOps } }; +NodeOps* 
SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + PriorBoxOps* ops = new PriorBoxOps(); + + return ops; +} + } // namespace PriorBoxImpl using namespace PriorBoxImpl; void RegisterPriorBoxNodeExec(void) { - PriorBoxOps* ops = new PriorBoxOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "PriorBox", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "PriorBox", PriorBoxImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/region.cpp b/executor/operator/common/region.cpp index a380f137a..bc781c60b 100644 --- a/executor/operator/common/region.cpp +++ b/executor/operator/common/region.cpp @@ -135,15 +135,26 @@ struct RegionOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + RegionOps* ops = new RegionOps(); + + return ops; +} + } // namespace RegionImpl using namespace RegionImpl; void RegisterRegionNodeExec(void) { - RegionOps* ops = new RegionOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Region", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Region", RegionImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/relu.cpp b/executor/operator/common/relu.cpp index 57e963a68..f353d68d1 100644 --- a/executor/operator/common/relu.cpp +++ b/executor/operator/common/relu.cpp @@ -103,15 +103,26 @@ struct ReLuOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, 
Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ReLuOps* ops = new ReLuOps(); + + return ops; +} + } // namespace ReLuImpl using namespace ReLuImpl; void RegisterReLuNodeExec(void) { - ReLuOps* ops = new ReLuOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "ReLu", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "ReLu", ReLuImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/relu6.cpp b/executor/operator/common/relu6.cpp index 635d4a382..87d946b75 100644 --- a/executor/operator/common/relu6.cpp +++ b/executor/operator/common/relu6.cpp @@ -89,15 +89,26 @@ struct ReLu6Ops : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ReLu6Ops* ops = new ReLu6Ops(); + + return ops; +} + } // namespace ReLu6Impl using namespace ReLu6Impl; void RegisterReLu6NodeExec(void) { - ReLu6Ops* ops = new ReLu6Ops(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "ReLu6", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "ReLu6", ReLu6Impl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/reorg.cpp b/executor/operator/common/reorg.cpp index 3342d8911..52bea5da2 100644 --- a/executor/operator/common/reorg.cpp +++ b/executor/operator/common/reorg.cpp @@ -81,15 +81,26 @@ struct ReorgOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = 
input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ReorgOps* ops = new ReorgOps(); + + return ops; +} + } // namespace ReorgImpl using namespace ReorgImpl; void RegisterReorgNodeExec(void) { - ReorgOps* ops = new ReorgOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Reorg", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Reorg", ReorgImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/reshape.cpp b/executor/operator/common/reshape.cpp index 4f0ec4677..3785048fa 100644 --- a/executor/operator/common/reshape.cpp +++ b/executor/operator/common/reshape.cpp @@ -54,15 +54,26 @@ struct ReshapeOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ReshapeOps* ops = new ReshapeOps(); + + return ops; +} + } // namespace ReshapeImpl using namespace ReshapeImpl; void RegisterReshapeNodeExec(void) { - ReshapeOps* ops = new ReshapeOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Reshape", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Reshape", ReshapeImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/resize.cpp b/executor/operator/common/resize.cpp index 4ba4f4797..1ce8316ad 100644 --- a/executor/operator/common/resize.cpp +++ b/executor/operator/common/resize.cpp @@ -309,15 +309,26 @@ struct ResizeOps : public MTNodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* 
exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ResizeOps* ops = new ResizeOps(); + + return ops; +} + } // namespace ResizeImpl using namespace ResizeImpl; void RegisterResizeNodeExec(void) { - ResizeOps* ops = new ResizeOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Resize", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Resize", ResizeImpl::SelectFunc, 1000); } } // namespace TEngine \ No newline at end of file diff --git a/executor/operator/common/roi_pooling.cpp b/executor/operator/common/roi_pooling.cpp index bd550175f..07964188f 100644 --- a/executor/operator/common/roi_pooling.cpp +++ b/executor/operator/common/roi_pooling.cpp @@ -117,15 +117,26 @@ struct ROIPoolingOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ROIPoolingOps* ops = new ROIPoolingOps(); + + return ops; +} + } // namespace ROIPoolingImpl using namespace ROIPoolingImpl; void RegisterROIPoolingNodeExec(void) { - ROIPoolingOps* ops = new ROIPoolingOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "ROIPooling", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "ROIPooling", ROIPoolingImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/rpn.cpp b/executor/operator/common/rpn.cpp index fe669535a..e06ced2ec 100644 --- a/executor/operator/common/rpn.cpp +++ b/executor/operator/common/rpn.cpp @@ -321,15 +321,26 @@ struct RPNOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = 
input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + RPNOps* ops = new RPNOps(); + + return ops; +} + } // namespace RPNImpl using namespace RPNImpl; void RegisterRPNNodeExec(void) { - RPNOps* ops = new RPNOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "RPN", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "RPN", RPNImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/scale.cpp b/executor/operator/common/scale.cpp index f291673ec..8cf3d9953 100644 --- a/executor/operator/common/scale.cpp +++ b/executor/operator/common/scale.cpp @@ -108,15 +108,26 @@ struct ScaleOps : public NodeOps } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + ScaleOps* ops = new ScaleOps(); + + return ops; +} + } // namespace ScaleImpl using namespace ScaleImpl; void RegisterScale_NodeExec(void) { - ScaleOps* ops = new ScaleOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Scale", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Scale", ScaleImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/common/slice.cpp b/executor/operator/common/slice.cpp index b65389322..469214d2d 100644 --- a/executor/operator/common/slice.cpp +++ b/executor/operator/common/slice.cpp @@ -19,7 +19,7 @@ /* * Copyright (c) 2018, Open AI Lab - * Author: chunyinglv@openailab.com + * Author: ruizhang@openailab.com */ #include #include @@ -30,42 +30,160 @@ #include "node_ops.hpp" #include "tensor_mem.hpp" #include "graph.hpp" +#include "operator/slice.hpp" + namespace 
TEngine { namespace SliceImpl { - +const int default_prio = 200; struct SliceOps : public NodeOps { - bool Run(Node* node) + template + bool caffe_run(Node *node) { - // currently, only working on channel C (slice_axis=1) + // get the slice param + Slice * slice_op = dynamic_cast(node->GetOp()); + SliceParam * param = slice_op->GetParam(); + int slice_axis = param->axis; + int num_slices = 1; + int slice_size = 1; Tensor* input_tensor = node->GetInputTensor(0); - Tensor* output_tensor0 = node->GetOutputTensor(0); - Tensor* output_tensor1 = node->GetOutputTensor(1); - const std::vector& dims = input_tensor->GetShape().GetDim(); + const TShape& input_shape = input_tensor->GetShape(); + T* input = ( T* )get_tensor_mem(input_tensor); + std::vector in_dim = input_shape.GetDim(); + for(int i = 0; i < slice_axis; i++) + { + num_slices = num_slices * in_dim[i]; + } + for(unsigned int i = slice_axis + 1; i < in_dim.size(); i++) + { + slice_size = slice_size * in_dim[i]; + } + int in_slice = in_dim[slice_axis]; + int slice_index = 0; + unsigned int out_num = node->GetOutputNum(); + for(unsigned int i = 0; i < out_num; i++) + { + Tensor* output_tensor = node->GetOutputTensor(i); + T* output = (T* )get_tensor_mem(output_tensor); + int out_slice = (output_tensor->GetShape()).Shape(slice_axis); + + for(int n = 0; n < num_slices; n++) + { + int in_offset = (n * in_slice + slice_index) * slice_size; + int out_offset = n * out_slice * slice_size; + memcpy(output+out_offset,input + in_offset,slice_size * out_slice * sizeof(T)); + } + slice_index += out_slice; + } + return true; + + } + template + bool tf_run(Node *node) + { + // get the slice param + Slice * slice_op = dynamic_cast(node->GetOp()); + SliceParam * param = slice_op->GetParam(); + // get the input data + Tensor* input_tensor = node->GetInputTensor(0); + const TShape& input_shape = input_tensor->GetShape(); + T* input = (T* )get_tensor_mem(input_tensor); + Tensor* output_tensor = node->GetOutputTensor(0); + T *output = 
(T* )get_tensor_mem(output_tensor); + std::vector in_dim = input_shape.GetDim(); + int in_dim_new[4]; + int maxdim = 4; + int begins[4]; + int sizes[4]; + int real_dim = param->begin_.size(); + int dim_idx = 0; + for(int idx = 0; idx < maxdim; idx++) + { + if(maxdim - idx > real_dim) + { + begins[idx] = 0; + sizes[idx] = 1; + in_dim_new[idx] = 1; + } + else + { + begins[idx] = param->begin_[dim_idx]; + sizes[idx] = param->size_[dim_idx]; + in_dim_new[idx] = in_dim[dim_idx]; + dim_idx++; + } + } + int in_dim_0 = in_dim_new[0]; + int in_dim_1 = in_dim_new[1]; + int in_dim_2 = in_dim_new[2]; + int in_dim_3 = in_dim_new[3]; - int hw = dims[2] * dims[3]; - int slice_size = dims[1] / 2 * hw; - int size = dims[1] * hw; - float* input = ( float* )get_tensor_mem(input_tensor); - float* output0 = ( float* )get_tensor_mem(output_tensor0); - float* output1 = ( float* )get_tensor_mem(output_tensor1); + int start_dim_0 = (4 - real_dim) > 0 ? 0 : begins[0]; + int stop_dim_0 = ((4 - real_dim) > 0 || sizes[0] == -1) + ? in_dim_0 - start_dim_0 + : start_dim_0 + sizes[0]; + int start_dim_1 = (3 - real_dim) > 0 ? 0 : begins[1]; + int stop_dim_1 = ((3 - real_dim) > 0 || sizes[1] == -1) + ? in_dim_1 - start_dim_1 + : start_dim_1 + sizes[1]; + int start_dim_2 = (2 - real_dim) > 0 ? 0 : begins[2]; + int stop_dim_2 = ((2 - real_dim) > 0 || sizes[2] == -1) + ? in_dim_2 - start_dim_2 + : start_dim_2 + sizes[2]; + int start_dim_3 = (1 - real_dim) > 0 ? 0 : begins[3]; + int stop_dim_3 = ((1 - real_dim) > 0 || sizes[3] == -1) + ? 
in_dim_3 - start_dim_3 + : start_dim_3 + sizes[3]; - for(int i = 0; i < dims[0]; i++) + for(int n = start_dim_0; n < stop_dim_0;++n) { - float* in0 = input + i * size; - float* in1 = in0 + slice_size; - for(int j = 0; j < slice_size; j++) + for(int i = start_dim_1; i < stop_dim_1; ++i) { - output0[j] = in0[j]; - output1[j] = in1[j]; + for(int j = start_dim_2; j < stop_dim_2; ++j) + { + int len = stop_dim_3 - start_dim_3; + int input_off = n * in_dim_1 * in_dim_2 * in_dim_3 + + i * in_dim_2 * in_dim_3 + + j * in_dim_3 + start_dim_3; + memcpy(output,input + input_off,len * sizeof(T)); + output += len; + } } } return true; } + bool Run(Node* node) + { + Slice * slice_op = dynamic_cast(node->GetOp()); + SliceParam * param = slice_op->GetParam(); + if(param->iscaffe) + { + return caffe_run(node); + } + else + { + return tf_run(node); + } + } }; +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ +#ifdef CONFIG_ATUH_DEVICE + if(!get_auth_float_enabled()) + return nullptr; +#endif + + Tensor* input = node->GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 ||exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + SliceOps* ops = new SliceOps(); + return ops; +} } // namespace SliceImpl @@ -73,9 +191,9 @@ using namespace SliceImpl; void RegisterSliceNodeExec(void) { - SliceOps* ops = new SliceOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Slice", ops); + if(!NodeOpsRegistryManager::RegisterOPImplementor("common", "Slice", SliceImpl::SelectFunc, + SliceImpl::default_prio)) + LOG_ERROR()<<__FUNCTION__<<" :Regist OP failed for prio["<GetInputTensor(0); + const int data_type = input->GetDataType(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + if(data_type != TENGINE_DT_FP32 || exec_attr->graph_layout != TENGINE_LAYOUT_NCHW) + return nullptr; + + SoftmaxOps* ops = new SoftmaxOps(); + + return 
ops; +} + } // namespace SoftmaxImpl using namespace SoftmaxImpl; void RegisterSoftmaxNodeExec(void) { - SoftmaxOps* ops = new SoftmaxOps(); - - NodeOpsRegistryManager::RegisterOPImplementor("common", "Softmax", ops); + NodeOpsRegistryManager::RegisterOPImplementor("common", "Softmax", SoftmaxImpl::SelectFunc, 1000); } } // namespace TEngine diff --git a/executor/operator/init.cpp b/executor/operator/init.cpp new file mode 100644 index 000000000..5fc6f6373 --- /dev/null +++ b/executor/operator/init.cpp @@ -0,0 +1,27 @@ +namespace TEngine { + +extern void NodeOpsRegistryManagerInit(void); +extern void RegisterCommonOps(void); +extern void RegisterRefOps(void); + +#if CONFIG_ARCH_ARM64 == 1 +extern void RegisterArmOps(void); +#endif + +} + +using namespace TEngine; + +extern "C" int register_hclcpu_ops(void) +{ + RegisterCommonOps(); + RegisterRefOps(); + +#if CONFIG_ARCH_ARM64 + RegisterArmOps(); +#endif + + return 0; + +} + diff --git a/executor/operator/ref/Makefile b/executor/operator/ref/Makefile index f1037d992..62d066cfb 100644 --- a/executor/operator/ref/Makefile +++ b/executor/operator/ref/Makefile @@ -1,3 +1,34 @@ -obj-y+=demo_operator.o +obj-y+=init.o +obj-y+=ref_convolution.o +obj-y+=ref_pooling.o +obj-y+=ref_deconvolution.o +obj-y+=ref_fully_connected.o +obj-y+=ref_softmax.o +obj-y+=ref_concat.o +obj-y+=ref_permute.o +obj-y+=ref_swap_axis.o +obj-y+=ref_rpn.o +obj-y+=prelu.o +obj-y+=relu.o +obj-y+=relu6.o +obj-y+=sigmoid.o +obj-y+=squeeze.o +obj-y+=tanh.o +obj-y+=resize.o +obj-y+=reshape.o +obj-y+=flatten.o +obj-y+=dropout.o +obj-y+=ref_detection_postprocess.o +obj-y+=ref_lrn.o +obj-y+=eltwise.o +obj-y+=ref_slice.o +obj-y+=split.o +obj-y+=pad.o +obj-y+=reduction.o +obj-y+=ref_add_n.o +obj-y+=ref_batchnorm.o +obj-y+=ref_normalize.o + +COMMON_CFLAGS+=-I. 
diff --git a/executor/operator/ref/demo_operator.cpp b/executor/operator/ref/demo_operator.cpp deleted file mode 100644 index 132bbfc46..000000000 --- a/executor/operator/ref/demo_operator.cpp +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2018, Open AI Lab - * Author: haitao@openailab.com - */ - -#include -#include - -#include "logger.hpp" -#include "graph.hpp" -#include "node_ops.hpp" - -namespace TEngine { - -namespace demo_ops { - -struct DemoOps : public MTNodeOps -{ -public: - bool FloatPrerun(Node* node) - { - LOG_INFO() << "float prerun done!\n"; - return true; - } - - bool FloatPostrun(Node* node) - { - LOG_INFO() << "float post run done!\n"; - return true; - } - - bool FloatRun(Node* node) - { - LOG_INFO() << "float run done!\n"; - return true; - } - - bool IntPrerun(Node* node) - { - LOG_INFO() << "int prerun done!\n"; - return true; - } - - bool IntPostrun(Node* node) - { - LOG_INFO() << "int post run done!\n"; - return true; - } - - bool IntRun(Node* node) - { - LOG_INFO() << "int run done!\n"; - return true; - } - - bool MTIntRun(Node* node) - { - std::vector task_list; - - for(int i = 0; i < cpu_info->GetCPUNumber() * 2; i++) - { - sub_op_task task; - task.exec_func = std::move(std::bind(&DemoOps::IntAider, this, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3)); - task.seq = i; - task.data = ( void* )(( unsigned long )i); - - task_list.push_back(task); - } - - task_dispatch(task_list, -1); - - wait_done(); - - return true; - } - - bool MTFloatRun(Node* node) - { - std::vector task_list; - - for(int i = 0; i < cpu_info->GetCPUNumber() * 2; i++) - { - sub_op_task task; - task.exec_func = std::bind(&DemoOps::FloatAider, this, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - task.seq = i; - task.data = ( void* )(( unsigned long )i); - - task_list.push_back(task); - } - - task_dispatch(task_list, -1); - - wait_done(); - - return true; - } - - bool IntAider(int cpu, int seq, void* data) - { - int cpu_model = cpu_info->GetCPUModel(cpu); - - if(cpu_model == CPU_A72) - A53IntAider(cpu, seq, data); - else - A72IntAider(cpu, seq, data); - - return true; - } - - bool FloatAider(int cpu, int seq, void* data) - { - int 
cpu_model = cpu_info->GetCPUModel(cpu); - - if(cpu_model == CPU_A53) - A53FloatAider(cpu, seq, data); - else - A72FloatAider(cpu, seq, data); - - return true; - } - - bool A72FloatAider(int cpu, int seq, void* data) - { - unsigned long n = ( unsigned long )(data); - - LOG_INFO() << "cpu: " << cpu << " A72 FLOAT called\n"; - LOG_INFO() << "cpu: " << cpu << " will sleep " << n << " seconds\n"; - - std::chrono::milliseconds sleep_time(n * 1000); - std::this_thread::sleep_for(sleep_time); - - LOG_INFO() << "cpu: " << cpu << " DONE\n"; - - return true; - } - - bool A53FloatAider(int cpu, int seq, void* data) - { - unsigned long n = ( unsigned long )(data); - - LOG_INFO() << "cpu: " << cpu << " A53 FLOAT called\n"; - LOG_INFO() << "cpu: " << cpu << " will sleep " << n << " seconds\n"; - - std::chrono::milliseconds sleep_time(n * 1000); - std::this_thread::sleep_for(sleep_time); - - LOG_INFO() << "cpu: " << cpu << " DONE\n"; - - return true; - } - - bool A72IntAider(int cpu, int seq, void* data) - { - unsigned long n = ( unsigned long )(data); - - LOG_INFO() << "cpu: " << cpu << " A72 INT called\n"; - LOG_INFO() << "cpu: " << cpu << " will sleep " << n << " seconds\n"; - - std::chrono::milliseconds sleep_time(n * 1000); - std::this_thread::sleep_for(sleep_time); - - LOG_INFO() << "cpu: " << cpu << " DONE\n"; - - return true; - } - - bool A53IntAider(int cpu, int seq, void* data) - { - unsigned long n = ( unsigned long )(data); - - LOG_INFO() << "cpu: " << cpu << " A53 INT called\n"; - LOG_INFO() << "cpu: " << cpu << " will sleep " << n << " seconds\n"; - - std::chrono::milliseconds sleep_time(n * 1000); - std::this_thread::sleep_for(sleep_time); - - LOG_INFO() << "cpu: " << cpu << " DONE\n"; - - return true; - } - - /*****************************************************/ - bool Prerun(Node* node) override - { - if(float_mode) - return FloatPrerun(node); - else - return IntPrerun(node); - } - - bool Run(Node* node) override - { - std::cout << "Run launched on : " << 
cpu_info->GetCPUModelString(cpu_info->GetMasterCPU()) << "\n"; - - if(float_mode) - { - if(mt_mode) - return MTFloatRun(node); - else - return FloatRun(node); - } - else - { - if(mt_mode) - return MTIntRun(node); - else - return IntRun(node); - } - } - - bool Postrun(Node* node) override - { - if(float_mode) - return FloatPostrun(node); - else - return IntPostrun(node); - } - - DemoOps() - { - float_mode = true; - mt_mode = false; - } - - bool float_mode; - bool mt_mode; -}; - -NodeOps* SelectFunc(const CPUInfo* info, Node* node) -{ - DemoOps* ops = new DemoOps(); - - if(info->GetCPUNumber() > 1) - ops->mt_mode = true; - else - ops->mt_mode = false; - - Tensor* input_tensor = node->GetInputTensor(0); - - if(input_tensor->GetDataType() == TENGINE_DT_FP32) - { - ops->float_mode = true; - } - else - { - ops->float_mode = false; - } - - return ops; -} - -} // namespace demo_ops - -using namespace demo_ops; - -void RegisterDemoOps(void) -{ - NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "DemoOp", demo_ops::SelectFunc, 1000); -} - -} // namespace TEngine diff --git a/executor/operator/ref/dropout.cpp b/executor/operator/ref/dropout.cpp new file mode 100644 index 000000000..7d5c600d1 --- /dev/null +++ b/executor/operator/ref/dropout.cpp @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +// #include "kernel/Dropout/Dropout_kernel.h" + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/dropout.hpp" + +namespace TEngine { + +namespace RefDropoutOps { + + + +struct RefDropout : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + bool Postrun(Node * node) override; + // void InitRegistry(void); + // Dropout_param op_param; + // void * mem; + // Dropout_t kernel_run; + + + // KernelRegistry kernel_registry; + + RefDropout(void) + { + // mem=nullptr; + // kernel_run=nullptr; + + // InitRegistry(); + } +}; + + +bool RefDropout::Prerun(Node * node) +{ + // Tensor * input=node->GetInputTensor(0); + // Tensor* output_tensor = node->GetOutputTensor(0); + // int layout=exec_attr->graph_layout; + + // if(input->GetDataType() == TENGINE_DT_INT8 || + // input->GetDataType() == TENGINE_DT_UINT8 ) + // { + // if(get_scale_zero(input, output_tensor, &op_param) < 0) + // return false; + // } + + + // if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + // { + // set_tengine_errno(ENOENT); + // return false; + // } + + return true; +} + +bool RefDropout::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool RefDropout::Run(Node * node) +{ + // Tensor* input_tensor = node->GetInputTensor(0); + // Tensor* output_tensor = node->GetOutputTensor(0); + // const TShape& shape = input_tensor->GetShape(); + // void* data = get_tensor_mem(input_tensor); + // void* out_data = get_tensor_mem(output_tensor); + + // int size = shape.GetSize(); + // int 
ret=kernel_run(data,out_data,size,&op_param); + + // if(ret<0) + // return false; + // else + // return true; + + Tensor* input = node->GetInputTensor(0); + Tensor* output = node->GetOutputTensor(0); + auto i_quant = input->GetQuantParam(); + auto o_quant = output->GetQuantParam(); + if(i_quant->size() != 1) + { + LOG_ERROR()<<"input quant param num isnot 1 \n"; + return false; + } + o_quant->resize(0); + o_quant->push_back((*i_quant)[0]); + + return true; +} + +bool RefDropout::Postrun(Node * node) +{ + return true; +} + +// void RefDropout::InitRegistry(void) +// { +// #ifdef CONFIG_KERNEL_FP32 +// kernel_registry.Register((Dropout_t)Dropout_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); +// kernel_registry.Register((Dropout_t)Dropout_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +// #endif + +// #ifdef CONFIG_KERNEL_FP16 +// kernel_registry.Register((Dropout_t)Dropout_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); +// kernel_registry.Register((Dropout_t)Dropout_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +// #endif +// #ifdef CONFIG_KERNEL_INT8 +// kernel_registry.Register((Dropout_t)Dropout_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); +// kernel_registry.Register((Dropout_t)Dropout_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +// #endif + +// #ifdef CONFIG_KERNEL_UINT8 +// kernel_registry.Register((Dropout_t)Dropout_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); +// kernel_registry.Register((Dropout_t)Dropout_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +// #endif + +// } + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefDropout* ops = new RefDropout(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefReluOps +void RegisterDropoutOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Dropout", RefDropoutOps::SelectFunc, 1000); +} +} // namespace TEngine diff --git a/executor/operator/ref/eltwise.cpp b/executor/operator/ref/eltwise.cpp new file mode 100644 index 000000000..4ebaf9cd7 --- /dev/null 
+++ b/executor/operator/ref/eltwise.cpp @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "kernel/eltwise/eltwise.h" + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/eltwise.hpp" + +namespace TEngine { + +namespace RefEltwiseOps { + + + +struct EltwiseOps : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + bool Postrun(Node * node) override; + void InitRegistry(void); + + eltwise_param op_param; + eltwise_t kernel_run; + + KernelRegistry kernel_registry; + + EltwiseOps(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; +static int get_scale_zero(Tensor* itensor,Tensor * otensor,eltwise_param* param) +{ + auto* i_quant = itensor->GetQuantParam(); + auto* o_quant = otensor->GetQuantParam(); + if( i_quant->size() != 1 ) + { + LOG_ERROR()<<"Input quant size: ("<size()<<")\n"; + return -1; + } + param->scale[0] = (*i_quant)[0].scale; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + if( 
o_quant->size() != 1) + { + LOG_ERROR()<<"Output quant size: ("<size()<<")\n"; + return -1; + } + + param->scale[2] = (*o_quant)[0].scale; + param->zero[2] = (*o_quant)[0].zero_point; + + param->zero[0] = (*i_quant)[0].zero_point; + } + + return 0; +} +static int get_scale_zero_1(Tensor* itensor,eltwise_param* param) +{ + auto* i_quant = itensor->GetQuantParam(); + if( i_quant->size() != 1 ) + { + LOG_ERROR()<<"Input quant size: ("<size()<<")\n"; + return -1; + } + param->scale[1] = (*i_quant)[0].scale; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + + param->zero[1] = (*i_quant)[0].zero_point; + } + return 0; +} + +bool EltwiseOps::Prerun(Node * node) +{ + Tensor * input_tensor=node->GetInputTensor(0); + + int layout=exec_attr->graph_layout; + + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + //int elem_size=DataType::GetTypeSize(input->GetDataType()); + + return true; +} + + +bool EltwiseOps::Run(Node * node) +{ + Tensor* input_tensor0 = node->GetInputTensor(0); + int element_size = DataType::GetTypeSize(input_tensor0->GetDataType()); + const TShape& ishape = input_tensor0->GetShape(); + void* input0 = get_tensor_mem(input_tensor0); + Tensor* input_tensor1 = nullptr; + void* input1 = nullptr; + int input1_count4 = 0; + int input_chan_1 = 0; + int input_hw_1 = 0; + int input_h_1 = 0; + int input_w_1 = 0; + int input_n_1 = 0; + // this version only support for input_num=2 + // int input_number=node->GetInputNum(); + + // output + Tensor* output_tensor = node->GetOutputTensor(0); + if(input_tensor0->GetDataType() == TENGINE_DT_INT8 ||input_tensor0->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero(input_tensor0,output_tensor, &op_param) < 0) + return false; + } + + if(node->GetInputNum() > 1) + { + input_tensor1 = node->GetInputTensor(1); + const TShape& ishape1 = input_tensor1->GetShape(); + input1 = get_tensor_mem(input_tensor1); + input1_count4 = 
input_tensor1->GetTotalSize() / element_size; + input_n_1=ishape1.GetN(); + input_chan_1 = ishape1.GetC(); + input_hw_1 = ishape1.GetH() * ishape1.GetW(); + input_h_1=ishape1.GetH(); + input_w_1=ishape1.GetW(); + + if(input_tensor1->GetDataType() == TENGINE_DT_INT8 || + input_tensor1->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero_1(input_tensor1, &op_param) < 0) + return false; + } + } + int layout = ishape.GetDataLayout(); + void* output = get_tensor_mem(output_tensor); + Eltwise* eltwise_op = dynamic_cast(node->GetOp()); + EltwiseParam* param = eltwise_op->GetParam(); + int input_count4 = input_tensor0->GetTotalSize() / element_size; + int input_chan = ishape.GetC(); + int input_hw = ishape.GetH() * ishape.GetW(); + int input_h=ishape.GetH(); + int input_w=ishape.GetW(); + int input_n=ishape.GetN(); + //get out_tensor size + Tensor* output_tensor0 = node->GetOutputTensor(0); + int out_element_size = DataType::GetTypeSize(output_tensor0->GetDataType()); + int out_size = output_tensor0->GetTotalSize()/out_element_size; + float * output_buf=(float *)malloc(sizeof(float)*out_size); + int ret=kernel_run(output, input0, input1, param->type, input_count4, + input_chan,input_chan_1,input_hw,input_hw_1, input1_count4, + input_h,input_w,input_h_1,input_w_1,input_n,input_n_1,layout, + out_size,output_buf,&op_param); + free(output_buf); + + if(input_tensor1->GetDataType() == TENGINE_DT_INT8 + || input_tensor0->GetDataType() == TENGINE_DT_INT8) + { + + auto* o_quant = output_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale =op_param.scale[2]; + o_quant->resize(0); + o_quant->push_back(q_param); + + } + + + if(ret<0) + return false; + else + return true; +} + +bool EltwiseOps::Postrun(Node * node) +{ + return true; +} + +void EltwiseOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((eltwise_t)eltwise_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + 
kernel_registry.Register((eltwise_t)eltwise_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((eltwise_t)eltwise_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((eltwise_t)eltwise_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((eltwise_t)eltwise_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((eltwise_t)eltwise_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((eltwise_t)eltwise_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((eltwise_t)eltwise_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + EltwiseOps* ops = new EltwiseOps(); + + LOG_DEBUG()<<"EltwiseOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefEltwiseOps +void RegisterEltwiseOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Eltwise", RefEltwiseOps::SelectFunc, 1000); +} + +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/flatten.cpp b/executor/operator/ref/flatten.cpp new file mode 100644 index 000000000..4ebcab773 --- /dev/null +++ b/executor/operator/ref/flatten.cpp @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + + + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/flatten.hpp" + +namespace TEngine { + +namespace RefFlattenOps { + + + +struct RefFlatten : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + bool Postrun(Node * node) override; + + RefFlatten(void) + { + + } +}; + + +bool RefFlatten::Prerun(Node * node) +{ + return true; +} + +bool RefFlatten::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool RefFlatten::Run(Node * node) +{ + + return true; +} + +bool RefFlatten::Postrun(Node * node) +{ + return true; +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefFlatten* ops = new RefFlatten(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefReluOps +void RegisterFlattenOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Flatten", RefFlattenOps::SelectFunc, 1000); +} +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/init.cpp b/executor/operator/ref/init.cpp new file mode 100644 index 000000000..5f3015803 --- /dev/null +++ b/executor/operator/ref/init.cpp @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) 
under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: haitao@openailab.com + */ +#include +#include + +namespace TEngine { + +extern void RegisterRefPoolingOps(void); +extern void RegisterRefConv2d(void); +extern void RegisterRefDeconv2d(void); +extern void RegisterRefSoftmaxOps(void); +extern void RegisterRefDetectionPostOps(void); +extern void RegisterRefFCOps(void); +extern void RegisterRelu6Ops(void); +extern void RegisterReluOps(void); +extern void RegisterPreluOps(void); +extern void RegisterTanhOps(void); +extern void RegisterSigmoidOps(void); +extern void RegisterResizeOps(void); +extern void RegisterFlattenOps(void); +extern void RegisterReshapeOps(void); +extern void RegisterDropoutOps(void); +extern void RegisterRefConcat(void); +extern void RegisterRefPermute(void); +extern void RegisterRefLrn(void); +extern void RegisterEltwiseOps(void); +extern void RegisterRefSlice(void); +extern void RegisterSplitOps(void); +extern void RegisterPadOps(void); +extern void RegisterReductionOps(void); +extern void RegisterSqueezeOps(void); +extern void RegisterSwapAxisOps(void); +extern void RegisterRefRPNOps(void); +extern void RegisterRefBatchNormOps(void); +extern void RegisterRefNormlizeOps(void); +extern void 
RegisterRefAddNOps(void); + +void RegisterRefOps(void) +{ + RegisterRefPoolingOps(); + RegisterRefConv2d(); + RegisterRefDeconv2d(); + RegisterRefSoftmaxOps(); + RegisterRefDetectionPostOps(); + RegisterRefFCOps(); + RegisterRefConcat(); + RegisterRefPermute(); + RegisterRelu6Ops(); + RegisterReluOps(); + RegisterPreluOps(); + RegisterTanhOps(); + RegisterSigmoidOps(); + RegisterResizeOps(); + RegisterFlattenOps(); + RegisterReshapeOps(); + RegisterDropoutOps(); + RegisterRefLrn(); + RegisterEltwiseOps(); + RegisterRefSlice(); + RegisterSplitOps(); + RegisterPadOps(); + RegisterReductionOps(); + RegisterSqueezeOps(); + RegisterSwapAxisOps(); + RegisterRefRPNOps(); + RegisterRefBatchNormOps(); + RegisterRefNormlizeOps(); + RegisterRefAddNOps(); + +} + +} // namespace TEngine diff --git a/executor/operator/ref/kernel/concat/concat_fp16.c b/executor/operator/ref/kernel/concat/concat_fp16.c new file mode 100644 index 000000000..a494301df --- /dev/null +++ b/executor/operator/ref/kernel/concat/concat_fp16.c @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + + +static int ref_concat_fp16(const __fp16** in_data, __fp16* out_data, const struct concat_param* param) +{ + int axis = param->axis; + int concat_dim = 0; + for( int ii=0; iiinput_counts; ++ii ) + { + concat_dim += param->input_shape[ii].dim[axis]; + } + + if( concat_dim != param->output_shape.dim[axis] ) + { + printf("concant dimensions[%d] is not same output[%d]\n",concat_dim,param->output_shape.dim[axis]); + return -1; + } + + int out_size,in_size; + + out_size = 1; + for( int ii=0; iioutput_shape.dim[ii]; + } + in_size = 1; + for( int ii=axis+1; ii < param->output_dim; ++ii ) + { + in_size *= param->input_shape[0].dim[ii]; + } + + __fp16* output_ptr = out_data; + + for(int k = 0; k < out_size; ++k ) + { + for(int j = 0 ; jinput_counts; ++j ) + { + int cp_size = param->input_shape[j].dim[axis] * in_size; + memcpy(output_ptr, in_data[j] + k * cp_size, cp_size* sizeof(__fp16)); + output_ptr += cp_size; + } + } + + return 0; +} diff --git a/executor/operator/ref/kernel/concat/concat_fp32.c b/executor/operator/ref/kernel/concat/concat_fp32.c new file mode 100644 index 000000000..3065dac19 --- /dev/null +++ b/executor/operator/ref/kernel/concat/concat_fp32.c @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + + +static int ref_concat_fp32(const float** in_data, float* out_data, const struct concat_param* param) +{ + int axis = param->axis; + int concat_dim = 0; + for( int ii=0; iiinput_counts; ++ii ) + { + concat_dim += param->input_shape[ii].dim[axis]; + } + + if( concat_dim != param->output_shape.dim[axis] ) + { + printf("concant dimensions[%d] is not same output[%d]\n",concat_dim,param->output_shape.dim[axis]); + return -1; + } + + int out_size,in_size; + + out_size = 1; + for( int ii=0; iioutput_shape.dim[ii]; + } + in_size = 1; + for( int ii=axis+1; ii < param->output_dim; ++ii ) + { + in_size *= param->input_shape[0].dim[ii]; + } + + float* output_ptr = out_data; + + for(int k = 0; k < out_size; ++k ) + { + for(int j = 0 ; jinput_counts; ++j ) + { + int cp_size = param->input_shape[j].dim[axis] * in_size; + memcpy(output_ptr, in_data[j] + k * cp_size, cp_size* sizeof(float)); + output_ptr += cp_size; + } + } + + return 0; +} diff --git a/executor/operator/ref/kernel/concat/concat_int8.c b/executor/operator/ref/kernel/concat/concat_int8.c new file mode 100644 index 000000000..d4eff55f9 --- /dev/null +++ b/executor/operator/ref/kernel/concat/concat_int8.c @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + + +static int ref_concat_int8(const int8_t** in_data,int8_t* out_data,const struct concat_param* param) +{ + int axis = param->axis; + int concat_dim = 0; + for( int ii=0; iiinput_counts; ++ii ) + { + concat_dim += param->input_shape[ii].dim[axis]; + } + + if( concat_dim != param->output_shape.dim[axis] ) + { + printf("concant dimensions[%d] is not same output[%d]\n",concat_dim,param->output_shape.dim[axis]); + return -1; + } + + int outer_size,in_size; + outer_size = 1; + for( int ii=0; iioutput_shape.dim[ii]; + } + in_size = 1; + for( int ii=axis+1; ii < param->output_dim; ++ii ) + { + in_size *= param->output_shape.dim[ii]; + } + + int output_size = 1; + for( int ii=0; iioutput_dim;++ii ) + { + output_size *= param->output_shape.dim[ii]; + } + + float* output_tmp = (float*)malloc(output_size * 4); + if(NULL == output_tmp) + { + printf("Malloc output tmp memory failed\n"); + return -1; + } + + float* output_ptr = output_tmp; + float max_scale = 0.0f; + for(int k = 0; k < outer_size; ++k ) + { + for(int j = 0 ; jinput_counts; ++j ) + { + int cp_size = param->input_shape[j].dim[axis] * in_size; + float scale = param->input_shape[j].scale; + const int8_t* input_ptr = in_data[j] + k * cp_size; + + for(int ii=0; ii(param); + out_param->out_scale = max_scale; + + free(output_tmp); + return 0; +} diff --git a/executor/operator/ref/kernel/concat/concat_kernel.h b/executor/operator/ref/kernel/concat/concat_kernel.h new file mode 100644 index 000000000..0300cd21c --- 
/dev/null +++ b/executor/operator/ref/kernel/concat/concat_kernel.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + +#ifndef __CONACT_KERNEL_H__ +#define __CONACT_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct shape_dim +{ + int dim[4]; + float scale; + int zero; +}; + +struct concat_param +{ + struct shape_dim* input_shape; + int input_counts; + int input_dim; + struct shape_dim output_shape; + int output_dim; + int axis; + float out_scale; +}; + +typedef int (*concat_t)(const void** in_data, void* out_data, const struct concat_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "concat_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "concat_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "concat_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "concat_uint8.c" +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/concat/concat_uint8.c b/executor/operator/ref/kernel/concat/concat_uint8.c new file mode 100644 index 000000000..4902431c6 --- /dev/null +++ 
b/executor/operator/ref/kernel/concat/concat_uint8.c @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + + +static int ref_concat_uint8(const uint8_t** in_data,uint8_t* out_data,const struct concat_param* param) +{ + int axis = param->axis; + int concat_dim = 0; + for( int ii=0; iiinput_counts; ++ii ) + { + concat_dim += param->input_shape[ii].dim[axis]; + } + + if( concat_dim != param->output_shape.dim[axis] ) + { + printf("concant dimensions[%d] is not same output[%d]\n",concat_dim,param->output_shape.dim[axis]); + return -1; + } + + int outer_size,in_size; + outer_size = 1; + for( int ii=0; iioutput_shape.dim[ii]; + } + in_size = 1; + for( int ii=axis+1; ii < param->output_dim; ++ii ) + { + in_size *= param->output_shape.dim[ii]; + } + + int output_size = 1; + for( int ii=0; iioutput_dim;++ii ) + { + output_size *= param->output_shape.dim[ii]; + } + + float* output_tmp = (float*)malloc(output_size*4); + if(NULL == output_tmp) + { + printf("Malloc output tmp memory failed\n"); + return -1; + } + + float* output_ptr = output_tmp; + for(int k = 0; k < outer_size; ++k ) + { + for(int j = 0 ; jinput_counts; ++j ) + { + 
int cp_size = param->input_shape[j].dim[axis] * in_size; + float scale = param->input_shape[j].scale; + uint8_t input_zero = param->input_shape[j].zero; + + const uint8_t* input_ptr = (const uint8_t*)(in_data[j] + k * cp_size); + + for(int ii=0; iioutput_shape.scale; + uint8_t out_zero = param->output_shape.zero; + + uint8_t* last_output_ptr = out_data; + for(int ii=0; ii= 0) + { + if(tmp < 0) + tmp = 0; + if(activation== 1 && tmp>1) + tmp = 1; + if(activation== 2 && tmp>6) + tmp = 6; + } + +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + *input = fp32_to_fp16(tmp); +#else + *input = tmp; +#endif +} + +static int ref_conv_fp16(const __fp16 * input, __fp16 * output, const __fp16* kernel, const __fp16* bias, op_data* param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + + int kernel_size = input_c * param->kernels[0] * param->kernels[1]; + int n,g,c,h,w,kc,kh,kw; + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + for ( n = 0; n < batch; ++n){ + for( g = 0; g < group; ++g){ + for ( c = 0; c < output_c; ++c) { + for ( h = 0; h < output_h; ++h){ + for ( w = 0; w < output_w; ++w){ + const int h_start = (h * param->strides[0]) - param->pads[0]; + const int w_start = (w * param->strides[1]) - param->pads[1]; +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float total = bias ? fp16_to_fp32(bias[output_c* g + c]) : 0; +#else + __fp16 total = bias ? 
bias[output_c* g + c] : 0; +#endif + if(param->layout == 0){ + + output_offset = n*group*output_c*output_h*output_w + + g*output_c*output_h*output_w + + c* output_h*output_w + h*output_w + w; + } + else{ + output_offset = n*group*output_c*output_h*output_w + + h*output_w*group*output_c + w*group*output_c + + output_c*group + c; + + } + for (kc = 0; kc < input_c; ++kc){ + for ( kh = 0; kh < param->kernels[0]; ++kh){ + for ( kw = 0; kw < param->kernels[1]; ++kw){ + const int cur_y = h_start + param->dilations[0] * kh; + const int cur_x = w_start + param->dilations[1] * kw; + // If the location is outside the bounds of the input image, + // use zero as a default value. + if ((cur_x >= 0) && (cur_x < input_w) && (cur_y >= 0) && (cur_y < input_h)) { + if(param->layout == 0){ + input_offset = n*group*input_c*input_h*input_w + + g*input_c*input_h*input_w + + kc* input_h*input_w + cur_y*input_w + cur_x; + kernel_offset = g*output_c*kernel_size + c*kernel_size + + kc*param->kernels[0]*param->kernels[1] + + kh* param->kernels[1] + kw; + } + else{ + input_offset = n*group*input_c*input_h*input_w + + cur_y*input_w*input_c*group + cur_x* input_c*group + + g*input_c + kc; + kernel_offset = c*kernel_size*group +kh* param->kernels[1]*input_c*group + + kw*input_c*group + g*input_c + kc; + } + +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + total += fp16_to_fp32(input[input_offset]) * fp16_to_fp32(kernel[kernel_offset]); +#else + total += (input[input_offset] * kernel[kernel_offset]); +#endif + } + } + } + } +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + total = activation(total, param->activation); + output[output_offset] = fp32_to_fp16(total); +#else + activation_fp16(&total, param->activation); + output[output_offset] = total; +#endif + } + } + } + } + } + return 0; +} + diff --git a/executor/operator/ref/kernel/convolution/ref_conv_fp32.c b/executor/operator/ref/kernel/convolution/ref_conv_fp32.c new file mode 100644 index 000000000..e67864074 --- /dev/null +++ 
b/executor/operator/ref/kernel/convolution/ref_conv_fp32.c @@ -0,0 +1,80 @@ + + +static int ref_conv_fp32(const float * input, float * output, const float* kernel, const float* bias, op_data* param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + + int kernel_size = input_c * param->kernels[0] * param->kernels[1]; + int n,g,c,h,w,kc,kh,kw; + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + for ( n = 0; n < batch; ++n){ + for( g = 0; g < group; ++g){ + for ( c = 0; c < output_c; ++c) { + for ( h = 0; h < output_h; ++h){ + for ( w = 0; w < output_w; ++w){ + const int h_start = (h * param->strides[0]) - param->pads[0]; + const int w_start = (w * param->strides[1]) - param->pads[1]; + float total = 0.f; + if(param->layout == 0){ + + output_offset = n*group*output_c*output_h*output_w + + g*output_c*output_h*output_w + + c* output_h*output_w + h*output_w + w; + } + else{ + output_offset = n*group*output_c*output_h*output_w + + h*output_w*group*output_c + w*group*output_c + + output_c*g + c; + + } + for (kc = 0; kc < input_c; ++kc){ + for ( kh = 0; kh < param->kernels[0]; ++kh){ + for ( kw = 0; kw < param->kernels[1]; ++kw){ + const int cur_y = h_start + param->dilations[0] * kh; + const int cur_x = w_start + param->dilations[1] * kw; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((cur_x >= 0) && (cur_x < input_w) && (cur_y >= 0) && (cur_y < input_h)) { + if(param->layout == 0){ + input_offset = n*group*input_c*input_h*input_w + + g*input_c*input_h*input_w + + kc* input_h*input_w + cur_y*input_w + cur_x; + kernel_offset = g*output_c*kernel_size + c*kernel_size + + kc*param->kernels[0]*param->kernels[1] + + kh* param->kernels[1] + kw; + } + else{ + input_offset = n*group*input_c*input_h*input_w + + cur_y*input_w*input_c*group + cur_x* input_c*group + + g*input_c + kc; + kernel_offset = c*group*kernel_size + kh* param->kernels[1]*input_c*group + + kw*input_c*group + g*input_c + kc; + } + + total += (input[input_offset] * kernel[kernel_offset]); + } + } + } + } + float bias_value = 0.0f; + if (bias) { + bias_value = bias[output_c* g + c]; + } + output[output_offset] = activation(total + bias_value, param->activation); + } + } + } + } + } + return 0; +} + diff --git a/executor/operator/ref/kernel/convolution/ref_conv_int8.c b/executor/operator/ref/kernel/convolution/ref_conv_int8.c new file mode 100644 index 000000000..0b50c97c2 --- /dev/null +++ b/executor/operator/ref/kernel/convolution/ref_conv_int8.c @@ -0,0 +1,112 @@ + +static int ref_conv_int8(const int8_t * input, int8_t * output, const int8_t* kernel, const float* bias, op_data* param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + + int kernel_size = input_c * param->kernels[0] * param->kernels[1]; + + /* dequant input */ + int input_size = batch * group * input_c * input_h * input_w; + float* input_buf = (float*)malloc(sizeof(float) * input_size); + for(int i=0; iscale[0]; + } + + /* dequant kernel */ + int kernel_total = group *output_c* kernel_size; + float* kernel_buf = (float*)malloc(sizeof(float) * kernel_total); + for(int i=0; 
iscale[1]; + } + + /* malloc output */ + int output_size = group*batch*output_c*output_h*output_w; + float* output_buf = (float*)malloc(sizeof(float) * output_size); + + int n,g,c,h,w,kc,kh,kw; + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + for ( n = 0; n < batch; ++n){ + for( g = 0; g < group; ++g){ + for ( c = 0; c < output_c; ++c) { + for ( h = 0; h < output_h; ++h){ + for ( w = 0; w < output_w; ++w){ + const int h_start = (h * param->strides[0]) - param->pads[0]; + const int w_start = (w * param->strides[1]) - param->pads[1]; + float total = 0.f; + if(param->layout == 0){ + output_offset = n*group*output_c*output_h*output_w + + g*output_c*output_h*output_w + + c* output_h*output_w + h*output_w + w; + } + else{ + output_offset = n*group*output_c*output_h*output_w + + h*output_w*group*output_c + w*group*output_c + + output_c*group + c; + } + for (kc = 0; kc < input_c; ++kc){ + for ( kh = 0; kh < param->kernels[0]; ++kh){ + for ( kw = 0; kw < param->kernels[1]; ++kw){ + const int cur_y = h_start + param->dilations[0] * kh; + const int cur_x = w_start + param->dilations[1] * kw; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((cur_x >= 0) && (cur_x < input_w) && (cur_y >= 0) && (cur_y < input_h)) { + if(param->layout == 0){ + input_offset = n*group*input_c*input_h*input_w + + g*input_c*input_h*input_w + + kc* input_h*input_w + cur_y*input_w + cur_x; + kernel_offset = g*output_c*kernel_size + c*kernel_size + + kc*param->kernels[0]*param->kernels[1] + + kh* param->kernels[1] + kw; + } + else{ + input_offset = n*group*input_c*input_h*input_w + + cur_y*input_w*input_c*group + cur_x* input_c*group + + g*input_c + kc; + kernel_offset = c*kernel_size*group + kh* param->kernels[1]*input_c*group + + kw*input_c*group + g*input_c + kc; + } + + total += (input_buf[input_offset] * kernel_buf[kernel_offset]); + } + } + } + } + float bias_value = 0.0f; + if (bias) { + bias_value = bias[output_c* g + c]; + } + output_buf[output_offset] = activation(total + bias_value, param->activation); + } + } + } + } + } + float output_max = 0.0f; + for(int i =0; i< output_size; i++) + { + if(output_max < fabs(output_buf[i])) + output_max = fabs(output_buf[i]); + } + param->scale[2] = output_max/127; + for(int i =0; i< output_size; i++) + { + output[i] = round(output_buf[i]*127/output_max); + } + free(output_buf); + free(kernel_buf); + free(input_buf); + return 0; +} + diff --git a/executor/operator/ref/kernel/convolution/ref_conv_kernel.h b/executor/operator/ref/kernel/convolution/ref_conv_kernel.h new file mode 100644 index 000000000..60779708c --- /dev/null +++ b/executor/operator/ref/kernel/convolution/ref_conv_kernel.h @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#ifndef __REF_CONV_KERNEL_H__ +#define __REF_CONV_KERNEL_H__ + +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct op_data +{ + int in_shape[3]; //NCHW + int out_shape[3]; //CHW + int kernels[2]; + int strides[2]; + int dilations[2]; + int pads[2]; + int batch; + int group; + int activation; + int layout; + int zero[3]; //input, kernel, output + float scale[3]; //input, kernel, output +}; + +static inline float activation(float input, int activation) +{ + if( activation >= 0) + { + if(input < 0) + input = 0; + if(activation== 1 && input>1) + input = 1; + if(activation== 6 && input>6) + input = 6; + } + + return input; +} + +typedef int (*ref_conv_kernel_t)(const void * input, void * output, const void* kernel, const void* bias, op_data* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_conv_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_conv_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_conv_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_conv_uint8.c" +#endif + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/convolution/ref_conv_uint8.c b/executor/operator/ref/kernel/convolution/ref_conv_uint8.c new file mode 100644 index 000000000..ce56ddbe2 --- /dev/null +++ b/executor/operator/ref/kernel/convolution/ref_conv_uint8.c @@ -0,0 +1,108 @@ + +static int ref_conv_uint8(const uint8_t * input, uint8_t * output, const uint8_t* kernel, const int* 
bias, op_data* param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + + int kernel_size = input_c * param->kernels[0] * param->kernels[1]; + + /* dequant input */ + int input_size = batch * group * input_c * input_h * input_w; + float* input_buf = (float*)malloc(sizeof(float) * input_size); + for(int i=0; izero[0]) * param->scale[0]; + + /* dequant kernel */ + int kernel_total = group *output_c* kernel_size; + float* kernel_buf = (float*)malloc(sizeof(float) * kernel_total); + for(int i=0; izero[1]) * param->scale[1]; + + /* dequant biases */ + int bias_size = group *output_c; + + float* bias_buf = NULL; + if(bias != NULL) + { + bias_buf = (float*)malloc(sizeof(float) * bias_size); + for(int i=0; iscale[0] * param->scale[1]; + } + + int n,g,c,h,w,kc,kh,kw; + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + for ( n = 0; n < batch; ++n){ + for( g = 0; g < group; ++g){ + for ( c = 0; c < output_c; ++c) { + for ( h = 0; h < output_h; ++h){ + for ( w = 0; w < output_w; ++w){ + const int h_start = (h * param->strides[0]) - param->pads[0]; + const int w_start = (w * param->strides[1]) - param->pads[1]; + float total = 0.f; + if(param->layout == 0){ + output_offset = n*group*output_c*output_h*output_w + + g*output_c*output_h*output_w + + c* output_h*output_w + h*output_w + w; + } + else{ + output_offset = n*group*output_c*output_h*output_w + + h*output_w*group*output_c + w*group*output_c + + output_c*g + c; + } + for (kc = 0; kc < input_c; ++kc){ + for ( kh = 0; kh < param->kernels[0]; ++kh){ + for ( kw = 0; kw < param->kernels[1]; ++kw){ + const int cur_y = h_start + param->dilations[0] * kh; + const int cur_x = w_start + param->dilations[1] * kw; + // If the location is outside the bounds of the input image, + 
// use zero as a default value. + if ((cur_x >= 0) && (cur_x < input_w) && (cur_y >= 0) && (cur_y < input_h)) { + if(param->layout == 0){ + input_offset = n*group*input_c*input_h*input_w + + g*input_c*input_h*input_w + + kc* input_h*input_w + cur_y*input_w + cur_x; + kernel_offset = g*output_c*kernel_size + c*kernel_size + + kc*param->kernels[0]*param->kernels[1] + + kh* param->kernels[1] + kw; + } + else{ + input_offset = n*group*input_c*input_h*input_w + + cur_y*input_w*input_c*group + cur_x* input_c*group + + g*input_c + kc; + kernel_offset = c*kernel_size*group + kh* param->kernels[1]*input_c*group + + kw*input_c*group + g*input_c + kc; + } + total += (input_buf[input_offset] * kernel_buf[kernel_offset]); + } + } + } + } + float bias_value = 0.0f; + if (bias != NULL) { + bias_value = bias_buf[output_c* g + c]; + } + total = activation(total + bias_value,param->activation); + int out = round(total/param->scale[2]) + param->zero[2]; + if(out > 255) out = 255; + if(out < 0 ) out = 0; + output[output_offset] = out; + } + } + } + } + } + if( bias != NULL) + free(bias_buf); + free(kernel_buf); + free(input_buf); + return 0; +} + diff --git a/executor/operator/ref/kernel/deconvolution/ref_deconv_fp16.c b/executor/operator/ref/kernel/deconvolution/ref_deconv_fp16.c new file mode 100644 index 000000000..b59dbd1f7 --- /dev/null +++ b/executor/operator/ref/kernel/deconvolution/ref_deconv_fp16.c @@ -0,0 +1,185 @@ +static inline void activation_fp16(__fp16* input, int activation) +{ +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float tmp = fp16_to_fp32(*input); +#else + __fp16 tmp = *input; +#endif + if( activation >= 0) + { + if(tmp < 0) + tmp = 0; + if(activation== 1 && tmp>1) + tmp = 1; + if(activation== 2 && tmp>6) + tmp = 6; + } + +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + *input = fp32_to_fp16(tmp); +#else + *input = tmp; +#endif +} + + +static int ref_deconv_fp16(const __fp16* input, __fp16* output, const __fp16* kernel, const __fp16* bias, const deconv_ref_param* 
param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + int kernel_h = param->kernels[0]; + int kernel_w = param->kernels[1]; + int pad_h0 = param->pads[0]; + int pad_w0 = param->pads[1]; + int stride_h = param->strides[0]; + int stride_w = param->strides[1]; + int dilation_h = param->dilations[0]; + int dilation_w = param->dilations[1]; + + int n,g,c,h,w,kc,k_h,k_w; + int org_out_x = 0; + int org_out_y = 0; + int cur_out_x = 0; + int cur_out_y = 0; + + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + + memset((void*)output,0,output_h* output_w * output_c *batch* group * sizeof(__fp16)); + + for (n = 0; n < batch; ++n) + { + for( g = 0; g < group; ++g) + { + for(h = 0; h < input_h; h++) + { + for(w = 0;w < input_w; w++) + { + org_out_x = w * stride_w - pad_w0; + org_out_y = h * stride_h - pad_h0; + for(kc = 0; kc < input_c;kc++) + { + if(param->layout == 0) + { + input_offset = n * group * input_c * input_h * input_w + + g * input_c * input_h * input_w + + kc * input_h * input_w + + h * input_w + w; + } + else + { + input_offset = n * group * input_c * input_h * input_w + \ + h * group * input_c * input_w + \ + w * group * input_c + \ + g * input_c + kc; + } + for(c = 0; c < output_c; c++) + { + for(k_h = 0;k_h < kernel_h;k_h++) + { + for(k_w = 0;k_w < kernel_w; k_w++) + { + cur_out_x = org_out_x + k_w * dilation_w; + cur_out_y = org_out_y + k_h * dilation_h; + + if(cur_out_x >= 0 && cur_out_x < output_w && cur_out_y >=0 && cur_out_y < output_h) + { + if(param->layout == 0) + { + kernel_offset = g * output_c *input_c * kernel_h *kernel_w + \ + kc * output_c * kernel_h * kernel_w + \ + c * kernel_h * kernel_w + \ + k_h * kernel_w + k_w; + + output_offset = n * group * output_c * output_w * output_h +\ + g 
* output_c * output_w * output_h + \ + c * output_w * output_h +\ + cur_out_y * output_w + cur_out_x; + + } + else + { + kernel_offset = g * output_c * input_c * kernel_h * kernel_w +\ + k_h * kernel_w * output_c +\ + k_w * output_c + c; + output_offset = n * output_h * output_w * output_c * group +\ + cur_out_y * group * output_w * output_c + \ + cur_out_x * group * output_c + \ + g * output_c + c; + } +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float tmp = 0; + tmp = fp16_to_fp32(output[output_offset]); + tmp += (fp16_to_fp32(input[input_offset]) * fp16_to_fp32(kernel[kernel_offset])); + output[output_offset] = fp32_to_fp16(tmp); +#else + output[output_offset] += kernel[kernel_offset] * input[input_offset]; +#endif + } + } + } + } + } + } + } + } + } + if(NULL != bias) + { + for(n = 0; n < batch; n++) + { + for(g = 0; g < group; g++) + { + for(c = 0; c < output_c ;c++) + { + for(h = 0; h < output_h;h++) + { + for(w= 0;w < output_w;w++) + { + if(param->layout == 0) + { + output_offset = n * output_c * group * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_h * output_w + \ + h * output_w + w; + } + else + { + output_offset = n * output_c * group * output_w * output_h +\ + h * output_c * group * output_w + \ + w * output_c * group + c; + } +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float tmp = 0; + tmp = fp16_to_fp32(output[output_offset]); + tmp += (fp16_to_fp32(output[output_offset]) + fp16_to_fp32(bias[g*output_c+c]));; + output[output_offset] = fp32_to_fp16(tmp); +#else + output[output_offset] += bias[g*output_c +c]; +#endif + } + } + } + } + } + } + + //activation + if(param->activation >= 0) + { + for(n = 0; n < batch*group*output_c*output_w*output_h; n++) + { + activation_fp16(&output[n], param->activation); + } + } + + return 0; +} + diff --git a/executor/operator/ref/kernel/deconvolution/ref_deconv_fp32.c b/executor/operator/ref/kernel/deconvolution/ref_deconv_fp32.c new file mode 100644 index 000000000..613b367d7 --- 
/dev/null +++ b/executor/operator/ref/kernel/deconvolution/ref_deconv_fp32.c @@ -0,0 +1,127 @@ +static int ref_deconv_fp32(const float * input, float * output, const float* kernel, const float* bias, const deconv_ref_param* param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + int kernel_h = param->kernels[0]; + int kernel_w = param->kernels[1]; + int pad_h0 = param->pads[0]; + int pad_w0 = param->pads[1]; + int stride_h = param->strides[0]; + int stride_w = param->strides[1]; + int dilation_h = param->dilations[0]; + int dilation_w = param->dilations[1]; + + int n,g,c,h,w,kc,k_h,k_w; + int org_out_x = 0; + int org_out_y = 0; + int cur_out_x = 0; + int cur_out_y = 0; + + float input_val; + float weight_val; + float bias_val = 0; + + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + + memset((void*)output,0,output_h* output_w * output_c *batch* group*sizeof(float)); + + for (n = 0; n < batch; ++n){ + for( g = 0; g < group; ++g){ + for(h = 0; h < input_h; h++){ + for(w = 0;w < input_w; w++){ + org_out_x = w * stride_w - pad_w0; + org_out_y = h * stride_h - pad_h0; + for(kc = 0; kc < input_c;kc++){ + if(param->layout == 0){ + input_offset = n * group * input_c * input_h * input_w + \ + g * input_c * input_h * input_w + \ + kc * input_h * input_w + \ + h * input_w + w; + } + else{ + input_offset = n * group * input_c * input_h * input_w + \ + h * group * input_c * input_w + \ + w * group * input_c + \ + g * input_c + kc; + } + input_val = input[input_offset]; + for(c = 0; c < output_c; c++){ + for(k_h = 0;k_h < kernel_h;k_h++){ + for(k_w = 0;k_w < kernel_w; k_w++){ + cur_out_x = org_out_x + k_w * dilation_w; + cur_out_y = org_out_y + k_h * dilation_h; + if(cur_out_x >= 0 && cur_out_x < output_w + && 
cur_out_y >=0 && cur_out_y < output_h){ + if(param->layout == 0){ + kernel_offset = g * output_c *input_c * kernel_h *kernel_w + \ + kc * output_c * kernel_h * kernel_w + \ + c * kernel_h * kernel_w + \ + k_h * kernel_w + k_w; + + output_offset = n * group * output_c * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_w * output_h +\ + cur_out_y * output_w + cur_out_x; + } + else{ + kernel_offset = g * output_c * input_c * kernel_h * kernel_w +\ + k_h * kernel_w * output_c +\ + k_w * output_c + c; + output_offset = n * output_h * output_w * output_c * group +\ + cur_out_y * group * output_w * output_c + \ + cur_out_x * group * output_c + \ + g * output_c + c; + } + weight_val = kernel[kernel_offset]; + output[output_offset] += weight_val * input_val; + } + } + } + } + } + } + } + } + } + if(NULL != bias){ + for(n = 0; n < batch; n++){ + for(g = 0; g < group; g++){ + for(c = 0; c < output_c ;c++){ + bias_val = bias[g * output_c + c]; + for(h = 0; h < output_h;h++){ + for(w= 0;w < output_w;w++){ + if(param->layout == 0){ + output_offset = n * output_c * group * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_h * output_w + \ + h * output_w + w; + } + else{ + output_offset = n * output_c * group * output_w * output_h +\ + h * output_c * group * output_w + \ + w * output_c * group + c; + } + output[output_offset] += bias_val; + } + } + } + } + } + } + + //activation + for(n = 0; n < batch*group*output_c*output_w*output_h; n++) { + output[n] = activation(output[n], param->activation); + } + + return 0; +} + diff --git a/executor/operator/ref/kernel/deconvolution/ref_deconv_int8.c b/executor/operator/ref/kernel/deconvolution/ref_deconv_int8.c new file mode 100644 index 000000000..73b07b2ad --- /dev/null +++ b/executor/operator/ref/kernel/deconvolution/ref_deconv_int8.c @@ -0,0 +1,192 @@ +static int ref_deconv_int8(const int8_t* input, int8_t* output, const int8_t* kernel, const float* bias, deconv_ref_param* param) 
+{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + int kernel_h = param->kernels[0]; + int kernel_w = param->kernels[1]; + int pad_h0 = param->pads[0]; + int pad_w0 = param->pads[1]; + int stride_h = param->strides[0]; + int stride_w = param->strides[1]; + int dilation_h = param->dilations[0]; + int dilation_w = param->dilations[1]; + + int n,g,c,h,w,kc,k_h,k_w; + int org_out_x = 0; + int org_out_y = 0; + int cur_out_x = 0; + int cur_out_y = 0; + + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + + float input_scale = param->scale[0]; + float weight_scale = param->scale[1]; + float output_scale = 1/(input_scale* weight_scale); + int output_max = 0; + + int output_size = batch * output_c * group * output_h * output_w; + + float *output_tmp = (float*)malloc(output_size); + float input_val; + float weight_val; + if(NULL == output_tmp) + { + printf("Malloc output tmp memory failed!\n"); + return -1; + } + memset(output_tmp,0,output_size); + + for (n = 0; n < batch; ++n) + { + for( g = 0; g < group; ++g) + { + for(h = 0; h < input_h; h++) + { + for(w = 0;w < input_w; w++) + { + org_out_x = w * stride_w - pad_w0; + org_out_y = h * stride_h - pad_h0; + for(kc = 0; kc < input_c;kc++) + { + if(param->layout == 0) + { + input_offset = n * group * input_c * input_h * input_w + + g * input_c * input_h * input_w + + kc * input_h * input_w + + h * input_w + w; + } + else + { + input_offset = n * group * input_c * input_h * input_w + \ + h * group * input_c * input_w + \ + w * group * input_c + \ + g * input_c + kc; + } + for(c = 0; c < output_c; c++) + { + for(k_h = 0;k_h < kernel_h;k_h++) + { + for(k_w = 0;k_w < kernel_w; k_w++) + { + cur_out_x = org_out_x + k_w * dilation_w; + cur_out_y = org_out_y + k_h * dilation_h; 
+ + if(cur_out_x >= 0 && cur_out_x < output_w && cur_out_y >=0 && cur_out_y < output_h) + { + if(param->layout == 0) + { + kernel_offset = g * output_c *input_c * kernel_h *kernel_w + \ + kc * output_c * kernel_h * kernel_w + \ + c * kernel_h * kernel_w + \ + k_h * kernel_w + k_w; + + output_offset = n * group * output_c * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_w * output_h +\ + cur_out_y * output_w + cur_out_x; + + } + else + { + kernel_offset = g * output_c * input_c * kernel_h * kernel_w +\ + k_h * kernel_w * output_c +\ + k_w * output_c + c; + output_offset = n * output_h * output_w * output_c * group +\ + cur_out_y * group * output_w * output_c + \ + cur_out_x * group * output_c + \ + g * output_c + c; + } + input_val = input[input_offset] / input_scale; + weight_val = kernel[kernel_offset] / weight_scale; + output_tmp[output_offset] += input_val * weight_val ; + } + } + } + } + } + } + } + } + } + if(NULL != bias) + { + for(n = 0; n < batch; n++) + { + for(g = 0; g < group; g++) + { + for(c = 0; c < output_c ;c++) + { + for(h = 0; h < output_h;h++) + { + for(w= 0;w < output_w;w++) + { + if(param->layout == 0) + { + output_offset = n * output_c * group * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_h * output_w + \ + h * output_w + w; + } + else + { + output_offset = n * output_c * group * output_w * output_h +\ + h * output_c * group * output_w + \ + w * output_c * group + c; + } + output_tmp[output_offset] += bias[g*output_c +c]; + } + } + } + } + } + } + + + //activation + if(param->activation >= 0) + { + for(n = 0; n < batch*group*output_c*output_w*output_h; n++) + { + output_tmp[n] = activation(output_tmp[n], param->activation); + } + } + + + output_max = abs(output_tmp[0]); + + for(n = 1; n < output_size; n++) + { + if(fabs(output_tmp[n]) > output_max) + { + output_max = fabs(output_tmp[n]); + } + } + + output_scale = output_max / 127; + + // quant output + for(n = 0; n < 
batch*group*output_c*output_w*output_h; n++) + { + int output_data = round(output_tmp[n] / output_scale ); + if(output_data > 127) + output[n] = 127; + else if(output_data < -127) + output[n] = -127; + else + output[n] = (int8_t)output_data; + } + + param->scale[2] = output_scale; + free(output_tmp); + + return 0; +} + diff --git a/executor/operator/ref/kernel/deconvolution/ref_deconv_kernel.h b/executor/operator/ref/kernel/deconvolution/ref_deconv_kernel.h new file mode 100644 index 000000000..4a73fd203 --- /dev/null +++ b/executor/operator/ref/kernel/deconvolution/ref_deconv_kernel.h @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#ifndef __REF_DECONV_KERNEL_H__ +#define __REF_DECONV_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct deconv_ref_param +{ + int in_shape[4]; //NCHW + int out_shape[3]; //CHW + int kernels[2]; //hw + int strides[2]; //hw + int dilations[2]; //hw + int pads[2]; + int batch; + int group; + int activation; + int layout; + int zero[3]; //input, kernel, output + float scale[3]; //input, kernel, output +}; + +static inline float activation(float input, int activation) +{ + if( activation >= 0) + { + if(input < 0) + input = 0; + if(activation== 1 && input>1) + input = 1; + if(activation== 2 && input>6) + input = 6; + } + + return input; +} + +typedef int (*ref_deconv_kernel_t)(const void * input, void * output, const void* kernel, const void* bias, const deconv_ref_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_deconv_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_deconv_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_deconv_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_deconv_uint8.c" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/deconvolution/ref_deconv_uint8.c b/executor/operator/ref/kernel/deconvolution/ref_deconv_uint8.c new file mode 100644 index 000000000..cce178ecc --- /dev/null +++ b/executor/operator/ref/kernel/deconvolution/ref_deconv_uint8.c @@ -0,0 +1,182 @@ +static int ref_deconv_uint8(const uint8_t* input, uint8_t* output, const uint8_t* kernel, const int* bias, const deconv_ref_param* param) +{ + int batch = param->batch; + int group = param->group; + int input_c = param->in_shape[0]/group; + int input_h = param->in_shape[1]; + int input_w = param->in_shape[2]; + int output_c = param->out_shape[0]/group; + int output_h = param->out_shape[1]; + int output_w = param->out_shape[2]; + int kernel_h = 
param->kernels[0]; + int kernel_w = param->kernels[1]; + int pad_h0 = param->pads[0]; + int pad_w0 = param->pads[1]; + int stride_h = param->strides[0]; + int stride_w = param->strides[1]; + int dilation_h = param->dilations[0]; + int dilation_w = param->dilations[1]; + + int n,g,c,h,w,kc,k_h,k_w; + int org_out_x = 0; + int org_out_y = 0; + int cur_out_x = 0; + int cur_out_y = 0; + + int input_offset=0; + int kernel_offset=0; + int output_offset=0; + + float input_scale = param->scale[0]; + float weight_scale = param->scale[1]; + float output_scale = param->scale[2]; + + float input_val = 0; + float weight_val = 0; + float bias_val = 0; + uint8_t input_zero = param->zero[0]; + uint8_t weight_zero = param->zero[1]; + uint8_t output_zero = param->zero[2]; + + float output_size = batch * output_c * group * output_h * output_w; + float *output_tmp = (float*)malloc(output_size); + if(NULL == output_tmp) + { + printf("Malloc output tmp memory failed!\n"); + return -1; + } + memset(output_tmp,0,output_size); + + for (n = 0; n < batch; ++n) + { + for( g = 0; g < group; ++g) + { + for(h = 0; h < input_h; h++) + { + for(w = 0;w < input_w; w++) + { + org_out_x = w * stride_w - pad_w0; + org_out_y = h * stride_h - pad_h0; + for(kc = 0; kc < input_c;kc++) + { + if(param->layout == 0) + { + input_offset = n * group * input_c * input_h * input_w + + g * input_c * input_h * input_w + + kc * input_h * input_w + + h * input_w + w; + } + else + { + input_offset = n * group * input_c * input_h * input_w + \ + h * group * input_c * input_w + \ + w * group * input_c + \ + g * input_c + kc; + } + for(c = 0; c < output_c; c++) + { + for(k_h = 0;k_h < kernel_h;k_h++) + { + for(k_w = 0;k_w < kernel_w; k_w++) + { + cur_out_x = org_out_x + k_w * dilation_w; + cur_out_y = org_out_y + k_h * dilation_h; + + if(cur_out_x >= 0 && cur_out_x < output_w && cur_out_y >=0 && cur_out_y < output_h) + { + if(param->layout == 0) + { + kernel_offset = g * output_c *input_c * kernel_h *kernel_w + \ + kc * 
output_c * kernel_h * kernel_w + \ + c * kernel_h * kernel_w + \ + k_h * kernel_w + k_w; + + output_offset = n * group * output_c * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_w * output_h +\ + cur_out_y * output_w + cur_out_x; + + } + else + { + kernel_offset = g * output_c * input_c * kernel_h * kernel_w +\ + k_h * kernel_w * output_c +\ + k_w * output_c + c; + output_offset = n * output_h * output_w * output_c * group +\ + cur_out_y * group * output_w * output_c + \ + cur_out_x * group * output_c + \ + g * output_c + c; + } + + input_val = input_scale*(input[input_offset] - input_zero); + weight_val = weight_scale * (kernel[kernel_offset] - weight_zero); + output_tmp[output_offset] += input_val * weight_val; + } + } + } + } + } + } + } + } + } + if(NULL != bias) + { + for(n = 0; n < batch; n++) + { + for(g = 0; g < group; g++) + { + for(c = 0; c < output_c ;c++) + { + for(h = 0; h < output_h;h++) + { + for(w= 0;w < output_w;w++) + { + if(param->layout == 0) + { + output_offset = n * output_c * group * output_w * output_h +\ + g * output_c * output_w * output_h + \ + c * output_h * output_w + \ + h * output_w + w; + } + else + { + output_offset = n * output_c * group * output_w * output_h +\ + h * output_c * group * output_w + \ + w * output_c * group + c; + } + bias_val = bias[g*output_c +c] * input_scale * weight_scale; + output_tmp[output_offset] += bias_val; + } + } + } + } + } + } + + //activation + if(param->activation >= 0) + { + for(n = 0; n < batch*group*output_c*output_w*output_h; n++) + { + output_tmp[n] = activation(output_tmp[n], param->activation); + } + } + + //quant the output + for(n = 0; n < batch*group*output_c*output_w*output_h; n++) + { + int output_data = round( output_tmp[n] / output_scale + output_zero); + if(output_data > 255) + output[n] = 255; + else if(output_data < 0) + output[n] = 0; + else + output[n] = (uint8_t)output_data; + + } + + free(output_tmp); + return 0; +} + diff --git 
a/executor/operator/ref/kernel/dpp/ref_dpp_fp16.c b/executor/operator/ref/kernel/dpp/ref_dpp_fp16.c new file mode 100644 index 000000000..5422439fc --- /dev/null +++ b/executor/operator/ref/kernel/dpp/ref_dpp_fp16.c @@ -0,0 +1,36 @@ + +int ref_dpp_fp16(const __fp16* input, const __fp16* score, const __fp16* anchor, + float* detect_num, float* detect_class, float* detect_score, float* detect_boxes, dpp_param* param) +{ + const int num_classes = param->num_classes + 1; + const int num_boxes = param->num_boxes; + + /* transform __fp16 to fp32 */ + int input_size = num_boxes * 4; + int score_size = num_boxes * num_classes; + float* input_f = (float*)malloc( input_size * sizeof(float)); + float* score_f = (float*)malloc( score_size * sizeof(float)); + float* anchor_f = (float*)malloc( input_size * sizeof(float)); + +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + for(int i =0; i < input_size; i++) + input_f[i] = fp16_to_fp32(input[i]); + for(int i =0; i < input_size; i++) + score_f[i] = fp16_to_fp32(score[i]); + for(int i =0; i < input_size; i++) + anchor_f[i] = fp16_to_fp32(anchor[i]); +#else + for(int i =0; i < input_size; i++) + input_f[i] = input[i]; + for(int i =0; i < input_size; i++) + score_f[i] = score[i]; + for(int i =0; i < input_size; i++) + anchor_f[i] = anchor[i]; +#endif + + ref_dpp_common(input_f, score_f, anchor_f, param, detect_num, detect_class, detect_score, detect_boxes); + free(anchor_f); + free(score_f); + free(input_f); + return 0; +} diff --git a/executor/operator/ref/kernel/dpp/ref_dpp_fp32.c b/executor/operator/ref/kernel/dpp/ref_dpp_fp32.c new file mode 100644 index 000000000..dcf9a6cee --- /dev/null +++ b/executor/operator/ref/kernel/dpp/ref_dpp_fp32.c @@ -0,0 +1,6 @@ + +int ref_dpp_fp32(const float* input, const float* score, const float* anchor, + float* detect_num, float* detect_class, float* detect_score, float* detect_boxes, dpp_param* param) +{ + return ref_dpp_common(input, score, anchor, param, detect_num, detect_class, detect_score, 
detect_boxes);; +} diff --git a/executor/operator/ref/kernel/dpp/ref_dpp_kernel.h b/executor/operator/ref/kernel/dpp/ref_dpp_kernel.h new file mode 100644 index 000000000..fe34a09b6 --- /dev/null +++ b/executor/operator/ref/kernel/dpp/ref_dpp_kernel.h @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#ifndef __REF_DPP_KERNEL_H__ +#define __REF_DPP_KERNEL_H__ + +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct Dpp_Box +{ + float x0; // xmin + float y0; // ymin + float x1; // xmax + float y1; // ymax + int box_idx; + int class_idx; + float score; +}; + +struct dpp_param +{ + int max_detections; + int max_classes_per_detection; + float nms_score_threshold; + float nms_iou_threshold; + int num_classes; + int num_boxes; + float scales[4]; + float quant_scale[3]; + int zero[3]; +}; + +#define DPP_MIN(a,b) ( ab ? 
a : b ) + +typedef int (*ref_dpp_kernel_t )(const void* input, const void* score, const void* anchor, + void* detect_num, void* detect_class, void* detect_score, void* detect_boxes, dpp_param* param); + +static inline float intersection_area(const struct Dpp_Box a, const struct Dpp_Box b) +{ + if(a.x0 > b.x1 || a.x1 < b.x0 || a.y0 > b.y1 || a.y1 < b.y0) + { + // no intersection + return 0.f; + } + + float inter_width = DPP_MIN(a.x1, b.x1) - DPP_MAX(a.x0, b.x0); + float inter_height = DPP_MIN(a.y1, b.y1) - DPP_MAX(a.y0, b.y0); + + return inter_width * inter_height; +} + +static inline void nms_sorted_bboxes(const struct Dpp_Box* boxes, int boxes_size, int* picked, int* picked_size, float nms_threshold) +{ + float areas[boxes_size]; + int n_picked = 0; + for(int i = 0; i < boxes_size; i++) + { + + float width = boxes[i].x1 - boxes[i].x0; + float height = boxes[i].y1 - boxes[i].y0; + + areas[i] = width * height; + } + + for(int i = 0; i < boxes_size; i++) + { + int keep = 1; + for(int j = 0; j < n_picked; j++) + { + + // intersection over union + float inter_area = intersection_area(boxes[i], boxes[picked[j]]); + float union_area = areas[i] + areas[picked[j]] - inter_area; + // float IoU = inter_area / union_area + if(inter_area / union_area > nms_threshold) + keep = 0; + } + + if(keep) + { + picked[n_picked] = i; + n_picked ++; + } + } + *picked_size = n_picked; +} + +void sort_boxes_by_score(struct Dpp_Box* boxes, int size) +{ + int i, j; + for(i = 0; i < size-1; i++) + { + int max_idx = i; + for(j = i + 1; j < size; j++) + { + if(boxes[j].score < 0.6) + continue; + if(boxes[max_idx].score < boxes[j].score) + max_idx = j; + } + if(i != max_idx) + { + struct Dpp_Box tmp; + memcpy(&tmp, boxes+i, sizeof(struct Dpp_Box)); + memcpy(boxes + i, boxes+max_idx, sizeof(struct Dpp_Box)); + memcpy(boxes + max_idx, &tmp, sizeof(struct Dpp_Box)); + } + else + { + if(boxes[max_idx].score < 0.6) + return ; + } + } +} + +static inline int decode_single_box(struct Dpp_Box* box, const 
float* box_ptr, const float* anchor_ptr, + const float* scales) +{ + int i = box->box_idx; + + const float* box_coord = box_ptr + i * 4; + const float* anchor = anchor_ptr + i * 4; + + // [0]: y [1]: x [2]: h [3]: w + float ycenter = box_coord[0] / scales[0] * anchor[2] + anchor[0]; + float xcenter = box_coord[1] / scales[1] * anchor[3] + anchor[1]; + float half_h = 0.5f * (exp(box_coord[2] / scales[2])) * anchor[2]; + float half_w = 0.5f * (exp(box_coord[3] / scales[3])) * anchor[3]; + + box->y0 = ycenter - half_h; + box->x0 = xcenter - half_w; + box->y1 = ycenter + half_h; + box->x1 = xcenter + half_w; + if(box->y0 < 0 || box->x0 < 0) + return -1; + return 0; +} + +void get_all_boxes_rect(struct Dpp_Box* all_class_bbox_rects, + const float* box, const float* scores, const float* anchor, + int num_boxes, int num_classes, float* scales) +{ + struct Dpp_Box selected_box; + for(int j = 0; j < num_boxes; j++) + { + for(int i = 1; i < num_classes; i++) + { + float score = scores[j * num_classes + i]; + + if(score < 0.6) + continue; + + selected_box.score = score; + selected_box.class_idx = i; + selected_box.box_idx = j; + //printf("score: %f ,box_idx: %d ,class: %d\n",score, j, i); + + if(decode_single_box(&selected_box, box, anchor, scales) < 0) + continue; + + //struct Box* cls_vector = all_class_bbox_rects[i]; + memcpy(all_class_bbox_rects + i*num_boxes +j, &selected_box, sizeof(struct Dpp_Box)); + + } + } +} + +int ref_dpp_common(const float* input_f, const float* score_f, const float* anchor_f, dpp_param* param, + float* detect_num, float* detect_class, float* detect_score, float* detect_boxes) +{ + + const int num_classes = param->num_classes + 1; + const int num_boxes = param->num_boxes; + const int max_detections = param->max_detections; + + struct Dpp_Box* all_boxes = (struct Dpp_Box*)malloc(num_classes*num_boxes*sizeof(struct Dpp_Box)); + memset(all_boxes, 0, sizeof(struct Dpp_Box)*num_classes*num_boxes); + + get_all_boxes_rect(all_boxes, input_f, score_f, 
anchor_f, + num_boxes, num_classes, param->scales); + + int max_picked_boxes = 2 * max_detections * num_classes; + struct Dpp_Box* picked_boxes = (struct Dpp_Box*)malloc(max_picked_boxes * sizeof(struct Dpp_Box)); + memset(picked_boxes, 0, sizeof(struct Dpp_Box)*max_picked_boxes); + int all_picked_size = 0; + + for(int i = 1; i < num_classes; i++) + { + struct Dpp_Box* class_box = all_boxes + i*num_boxes; + + // sort + sort_boxes_by_score(class_box, num_boxes); + int box_size = 0; + for(int j = 0; j < num_boxes; j ++) + { + if(class_box[j].score < 0.6) + break; + box_size ++; + } + if(box_size == 0) + continue; + + + + if( box_size > max_detections * 2) + box_size = max_detections * 2; + + int picked[num_boxes]; + int picked_size = 0; + + picked[0]=0; + nms_sorted_bboxes(class_box, box_size, picked, &picked_size, param->nms_iou_threshold); + + // save the survivors + for(int j = 0; j < picked_size; j++) + { + int z = picked[j]; + memcpy(picked_boxes + all_picked_size, class_box + z,sizeof(struct Dpp_Box)); + all_picked_size++; + } + + } + + sort_boxes_by_score(picked_boxes, max_picked_boxes); + if(all_picked_size > max_detections) + all_picked_size = max_detections; + + printf("all_picked_size: %d\n",all_picked_size); + // generate output tensors + detect_num[0] = all_picked_size; + + for(int i = 0; i < all_picked_size; i++) + { + + detect_class[i] = picked_boxes[i].class_idx; + detect_score[i] = picked_boxes[i].score; + + detect_boxes[4 * i] = picked_boxes[i].x0; + detect_boxes[4 * i + 1] = picked_boxes[i].y0; + detect_boxes[4 * i + 2] = picked_boxes[i].x1; + detect_boxes[4 * i + 3] = picked_boxes[i].y1; + } + + free(all_boxes); + free(picked_boxes); + + return 0; +} + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_dpp_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_dpp_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_dpp_uint8.c" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/dpp/ref_dpp_uint8.c 
b/executor/operator/ref/kernel/dpp/ref_dpp_uint8.c new file mode 100644 index 000000000..f3d362ea2 --- /dev/null +++ b/executor/operator/ref/kernel/dpp/ref_dpp_uint8.c @@ -0,0 +1,28 @@ + +int ref_dpp_uint8(const uint8_t* input, const uint8_t* score, const uint8_t* anchor, + float* detect_num, float* detect_class, float* detect_score, float* detect_boxes, dpp_param* param) +{ + const int num_classes = param->num_classes + 1; + const int num_boxes = param->num_boxes; + + /* transform uint8_t to fp32 */ + int input_size = num_boxes * 4; + int score_size = num_boxes * num_classes; + float* input_f = (float*)malloc( input_size * sizeof(float)); + float* score_f = (float*)malloc( score_size * sizeof(float)); + float* anchor_f = (float*)malloc( input_size * sizeof(float)); + for(int i =0; i < input_size; i++) + input_f[i] = (input[i] - param->zero[0]) * param->quant_scale[0]; + for(int i =0; i < score_size; i++) + score_f[i] = score[i] * param->quant_scale[1]; + for(int i =0; i < input_size; i++) + anchor_f[i] = (anchor[i] - param->zero[2]) * param->quant_scale[2]; + + ref_dpp_common(input_f, score_f, anchor_f, param, detect_num, detect_class, detect_score, detect_boxes); + + free(anchor_f); + free(score_f); + free(input_f); + + return 0; +} diff --git a/executor/operator/ref/kernel/eltwise/eltwise.h b/executor/operator/ref/kernel/eltwise/eltwise.h new file mode 100644 index 000000000..05f58b3da --- /dev/null +++ b/executor/operator/ref/kernel/eltwise/eltwise.h @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __ELTWISE_KERNEL_H__ +#define __ELTWISE_KERNEL_H__ + +#include +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct eltwise_param; + +struct eltwise_param +{ + float scale[3]; + int zero[3]; +}; + +typedef int (*eltwise_t)(void* output, void* input0, void* input1, int type, int input_count4, + int input_chan,int input_chan_1,int input_hw,int input_hw_1, int input1_count4, + int input_h,int input_w,int input_h_1,int input_w_1,int input_n,int input_n_1,int layout, + int out_size,float* output_buf,eltwise_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "eltwise_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "eltwise_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "eltwise_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "eltwise_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/eltwise/eltwise_fp16.c b/executor/operator/ref/kernel/eltwise/eltwise_fp16.c new file mode 100644 index 000000000..4f51a3508 --- /dev/null +++ b/executor/operator/ref/kernel/eltwise/eltwise_fp16.c @@ -0,0 +1,1010 @@ +static int eltwise_fp16(__fp16* output, __fp16* input0, __fp16* input1,int type, int input_count4, + int input_chan,int input_chan_1,int input_hw, int input_hw_1,int input1_count4, + int input_h,int input_w,int input_h_1,int input_w_1,int input_n, + int input_n_1,int layout,int out_size,float * 
output_buf,eltwise_param* param) +{ +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + switch(type) + { + case 10: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + *output++ = fp32_to_fp16(fp16_to_fp32((*input0++)) / fp16_to_fp32(input1[0])); + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + *output++ = fp32_to_fp16(fp16_to_fp32(input0[i]) / fp16_to_fp32(input1[i])); + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + *output++ = fp32_to_fp16(fp16_to_fp32(input0[0])/fp16_to_fp32((*input1++))); + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[0]) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[i] =result; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]) * param->scale[0]; + float real_input1 = (input1[i]) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[i] = result; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]) * param->scale[0]; + float real_input1 = ((*input1++)) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[i] = result; + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[k]) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[ofset] = result; + } + } + } + } + //nhwc + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[c]) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[ofst]=result; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[ofset]) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[ofset] = result; + } + } 
+ } + } + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[ofst]) * param->scale[1]; + float result=real_input0/real_input1; + output_buf[ofst]=result; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 0: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) ) * param->scale[0]; + float real_input1 = (input1[0]) * param->scale[1]; + float result=real_input0*real_input1; + printf("result: %f\n",result); + output_buf[i] =result; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]) * param->scale[0]; + float real_input1 = (input1[i]) * param->scale[1]; + float result=real_input0*real_input1; + output_buf[i] = result; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]) * param->scale[0]; + float real_input1 = ((*input1++)) * param->scale[1]; + float result=real_input0*real_input1; + output_buf[i] = result; + } + } + else if(input_chan == input1_count4) + { + + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[k]) * param->scale[1]; + float result=real_input0*real_input1; + output_buf[ofset] = result; + } + } + } + } + //nhwc + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[c]) * param->scale[1]; + float result=real_input0*real_input1; + output_buf[ofst]=result; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[ofset]) * param->scale[1]; + float result=real_input0*real_input1; + output_buf[ofset] = result; + } + } + } + } + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[ofst]) * param->scale[1]; + float result=real_input0*real_input1; + output_buf[ofst]=result; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 4: + { + if(input1_count4 == 1) + { + for(int i = 
0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) ) * param->scale[0]; + float real_input1 = (input1[0]) * param->scale[1]; + float result=real_input0-real_input1; + output_buf[i] =result; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + + float real_input0 = (input0[i]) * param->scale[0]; + float real_input1 = (input1[i]) * param->scale[1]; + float result=real_input0-real_input1; + *output_buf = result; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]) * param->scale[0]; + float real_input1 = ((*input1++)) * param->scale[1]; + float result=real_input0-real_input1; + output_buf[i] = result; + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[k]) * param->scale[1]; + float result=real_input0-real_input1; + output_buf[ofset] = result; + } + } + } + } + //nhwc + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[c]) * param->scale[1]; + float result=real_input0-real_input1; + output_buf[ofst]=result; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[ofset]) * param->scale[1]; + float result=real_input0-real_input1; + output_buf[ofset] = result; + } + } + } + } + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[ofst]) * param->scale[1]; + float result=real_input0-real_input1; + output_buf[ofst]=result; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 2: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) ) * param->scale[0]; + float real_input1 = (input1[0]) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[i] =result; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = 
(input0[i]) * param->scale[0]; + float real_input1 = (input1[i]) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[i] = result; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]) * param->scale[0]; + float real_input1 = ((*input1++)) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[i] = result; + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[k]) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[ofset] = result; + } + } + } + } + //nhwc + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[c]) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[ofst]=result; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jscale[0]; + float real_input1 = (input1[ofset]) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[ofset] = result; + } + } + } + } + else + { + for(int i=0;iscale[0]; + float real_input1 = (input1[ofst]) * param->scale[1]; + float result=real_input0+real_input1; + output_buf[ofst]=result; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 12: + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]) * param->scale[0]; + float result=exp(real_input0); + output_buf[i] = result; + } + break; + } + default: + break; + } + float output_max = 0.0f; + for(int i =0; i< out_size; i++) + { + if(output_max < fabs(output_buf[i])) + output_max = fabs(output_buf[i]); + } + param->scale[2] = output_max/127; + for(int i =0; i< out_size; i++) + { + output[i] = round(output_buf[i]*127/output_max); + } + + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/eltwise/eltwise_uint8.c b/executor/operator/ref/kernel/eltwise/eltwise_uint8.c new file mode 100644 index 000000000..f1e78fc8f --- 
/dev/null +++ b/executor/operator/ref/kernel/eltwise/eltwise_uint8.c @@ -0,0 +1,512 @@ +static int eltwise_uint8(uint8_t* output, uint8_t* input0, uint8_t* input1,int type, int input_count4, + int input_chan,int input_chan_1,int input_hw, int input_hw_1,int input1_count4, + int input_h,int input_w,int input_h_1,int input_w_1,int input_n, + int input_n_1,int layout,int out_size,float * output_buf,eltwise_param* param) +{ + switch(type) + { + case 10: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) - param->zero[0]) * param->scale[0]; + float real_input1 = (input1[0]- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + *output++ =round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]- param->zero[0]) * param->scale[0]; + float real_input1 = (input1[i]- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]- param->zero[0]) * param->scale[0]; + float real_input1 = ((*input1++)- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[k]- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + //nhwc + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[c]- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } 
+ } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[ofset]- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[ofst]- param->zero[1]) * param->scale[1]; + float result=real_input0/real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 0: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) - param->zero[0]) * param->scale[0]; + float real_input1 = (input1[0]- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + *output++ =round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]- param->zero[0]) * param->scale[0]; + float real_input1 = (input1[i]- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]- param->zero[0]) * param->scale[0]; + float real_input1 = ((*input1++)- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_chan == input1_count4) + { + + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[k]- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + //nhwc + else + { + for(int i=0;izero[0]) * param->scale[0]; + float 
real_input1 = (input1[c]- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[ofset]- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[ofst]- param->zero[1]) * param->scale[1]; + float result=real_input0*real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 4: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) - param->zero[0]) * param->scale[0]; + float real_input1 = (input1[0]- param->zero[1]) * param->scale[1]; + float result=real_input0-real_input1; + *output++ =round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + + float real_input0 = (input0[i]) * param->scale[0]; + float real_input1 = (input1[i]) * param->scale[1]; + float result=real_input0-real_input1; + *output = round(result / param->scale[2]); + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]- param->zero[0]) * param->scale[0]; + float real_input1 = ((*input1++)- param->zero[1]) * param->scale[1]; + float result=real_input0-real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[k]- param->zero[1]) * param->scale[1]; + float result=real_input0-real_input1; + output[ofset] = round(result / 
param->scale[2]) + param->zero[2]; + } + } + } + } + //nhwc + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[c]- param->zero[1]) * param->scale[1]; + float result=real_input0-real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[ofset]- param->zero[1]) * param->scale[1]; + float result=real_input0-real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[ofst]- param->zero[1]) * param->scale[1]; + float result=real_input0-real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 2: + { + if(input1_count4 == 1) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = ((*input0++) - param->zero[0]) * param->scale[0]; + float real_input1 = (input1[0]- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + *output++ =round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == input1_count4) + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]- param->zero[0]) * param->scale[0]; + float real_input1 = (input1[i]- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_count4 == 1) + { + for(int i = 0; i < input1_count4; ++i) + { + float real_input0 = (input0[0]- param->zero[0]) * param->scale[0]; + float real_input1 = ((*input1++)- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + } + else if(input_chan == input1_count4) + { + if(layout==0) + { + for(int 
j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[k]- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + //nhwc + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[c]- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + } + else if(input_chan_1 == input_count4) + { + if(layout==0) + { + for(int j=0;jzero[0]) * param->scale[0]; + float real_input1 = (input1[ofset]- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + output[ofset] = round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + else + { + for(int i=0;izero[0]) * param->scale[0]; + float real_input1 = (input1[ofst]- param->zero[1]) * param->scale[1]; + float result=real_input0+real_input1; + output[ofst]=round(result / param->scale[2]) + param->zero[2]; + } + } + } + } + } + + } + else + { + return -1; + } + break; + } + case 12: + { + for(int i = 0; i < input_count4; ++i) + { + float real_input0 = (input0[i]- param->zero[0]) * param->scale[0]; + float result=exp(real_input0); + *output++ = round(result / param->scale[2]) + param->zero[2]; + } + break; + } + default: + break; + } + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/fully_connected/ref_fc_fp16.c b/executor/operator/ref/kernel/fully_connected/ref_fc_fp16.c new file mode 100644 index 000000000..2f80cf269 --- /dev/null +++ b/executor/operator/ref/kernel/fully_connected/ref_fc_fp16.c @@ -0,0 +1,40 @@ + +static int ref_fc_fp16(const __fp16 * input, __fp16 * output, const __fp16* weight, const __fp16* bias, fc_data* param) +{ + int batch = param->batch; + int hidden = param->hidden; + int out_number = param->out_number; + + int n,i,j; + for ( n = 0; n < batch; ++n) + { + for( i = 0; i < out_number; ++i) + { +#if!defined( 
__ARM_ARCH) || __ARM_ARCH <8 + float tmp = bias ? fp16_to_fp32(bias[i]) : 0.0; + for ( j = 0; j < hidden; ++j) + { + if(param->need_trans == 0) + tmp += fp16_to_fp32(input[n*hidden + j]) * fp16_to_fp32(weight[i*hidden + j]); + else + tmp += fp16_to_fp32(input[n*hidden + j]) * fp16_to_fp32(weight[i + j*out_number]); + } + + output[n*out_number + i ] = fp32_to_fp16(tmp); +#else + __fp16 tmp = bias ? bias[i] : 0.0; + for ( j = 0; j < hidden; ++j) + { + if(param->need_trans == 0) + tmp += input[n*hidden + j] * weight[i*hidden + j]; + else + tmp += input[n*hidden + j] * weight[i + j*out_number]; + } + + output[n*out_number + i ] = tmp; +#endif + } + } + return 0; +} + diff --git a/executor/operator/ref/kernel/fully_connected/ref_fc_fp32.c b/executor/operator/ref/kernel/fully_connected/ref_fc_fp32.c new file mode 100644 index 000000000..55caca665 --- /dev/null +++ b/executor/operator/ref/kernel/fully_connected/ref_fc_fp32.c @@ -0,0 +1,28 @@ + + +static int ref_fc_fp32(const float * input, float * output, const float* weight, const float* bias, fc_data* param) +{ + int batch = param->batch; + int hidden = param->hidden; + int out_number = param->out_number; + + + int n,i,j; + for ( n = 0; n < batch; ++n) + { + for( i = 0; i < out_number; ++i) + { + float tmp = bias ? 
bias[i]:0.0; + for ( j = 0; j < hidden; ++j) + { + if(param->need_trans == 0) + tmp += input[n* hidden + j] * weight[i*hidden + j]; + else + tmp += input[n* hidden + j] * weight[i + j*out_number]; + } + output[n*out_number + i ] = tmp; + } + } + return 0; +} + diff --git a/executor/operator/ref/kernel/fully_connected/ref_fc_int8.c b/executor/operator/ref/kernel/fully_connected/ref_fc_int8.c new file mode 100644 index 000000000..4c1ad953c --- /dev/null +++ b/executor/operator/ref/kernel/fully_connected/ref_fc_int8.c @@ -0,0 +1,62 @@ + +static int ref_fc_int8(const int8_t * input, int8_t * output, const int8_t* weight, const float* bias, fc_data* param) +{ + int batch = param->batch; + int hidden = param->hidden; + int out_number = param->out_number; + + /* dequant input */ + int input_size = batch * hidden; + float* input_buf = (float*)malloc(sizeof(float) * input_size); + for(int i=0; iscale[0]; + } + + /* dequant kernel */ + int kernel_size = hidden * out_number; + float* weight_buf = (float*)malloc(sizeof(float) * kernel_size); + for(int i=0; iscale[1]; + } + + /* malloc output_buffer */ + int output_size = batch * out_number; + float* output_buf = (float*)malloc(sizeof(float) * output_size); + + int n,i,j; + for ( n = 0; n < batch; ++n) + { + for( i = 0; i < out_number; ++i) + { + float tmp = bias ? 
bias[i] :0.0; + for ( j = 0; j < hidden; ++j) + { + if(param->need_trans == 0) + tmp += input_buf[n* hidden + j] * weight_buf[i*hidden + j]; + else + tmp += input_buf[n* hidden + j] * weight_buf[i + j*out_number]; + } + output_buf[n*out_number + i ] = tmp; + } + } + + /* quant output */ + float output_max = 0.0f; + for(int i =0; i< output_size; i++) + { + if(output_max < fabs(output_buf[i])) + output_max = fabs(output_buf[i]); + } + param->scale[2] = output_max/127; + for(int i =0; i< output_size; i++) + { + output[i] = round(output_buf[i]*127/output_max); + } + free(output_buf); + free(weight_buf); + free(input_buf); + return 0; +} + diff --git a/executor/operator/ref/kernel/fully_connected/ref_fc_kernel.h b/executor/operator/ref/kernel/fully_connected/ref_fc_kernel.h new file mode 100644 index 000000000..fc55309e5 --- /dev/null +++ b/executor/operator/ref/kernel/fully_connected/ref_fc_kernel.h @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#ifndef __REF_FC_KERNEL_H__ +#define __REF_FC_KERNEL_H__ + +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct fc_data +{ + int need_trans; + int batch; //N + int out_number; //OUT + int hidden; //hidden + int zero[3]; //input, kernel, output + float scale[3]; //input, kernel, output +}; + + +typedef int (*ref_fc_kernel_t)(const void * input, void * output, const void* weight, const void* bias, fc_data* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_fc_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_fc_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_fc_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_fc_uint8.c" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/fully_connected/ref_fc_uint8.c b/executor/operator/ref/kernel/fully_connected/ref_fc_uint8.c new file mode 100644 index 000000000..b73d85ef0 --- /dev/null +++ b/executor/operator/ref/kernel/fully_connected/ref_fc_uint8.c @@ -0,0 +1,48 @@ + +static int ref_fc_uint8(const uint8_t * input, uint8_t * output, const uint8_t* weight, const int* bias, fc_data* param) +{ + int batch = param->batch; + int hidden = param->hidden; + int out_number = param->out_number; + + /* dequant input */ + int input_size = batch * hidden; + float* input_buf = (float*)malloc(sizeof(float) * input_size); + for(int i=0; izero[0]) * param->scale[0]; + } + + /* dequant kernel */ + int kernel_size = hidden * out_number; + float* weight_buf = (float*)malloc(sizeof(float) * kernel_size); + for(int i=0; izero[1]) * param->scale[1]; + } + + int n,i,j; + for ( n = 0; n < batch; ++n) + { + for( i = 0; i < out_number; ++i) + { + float tmp = bias? 
bias[i]*param->scale[0]*param->scale[1]:0.0; + for ( j = 0; j < hidden; ++j) + { + if(param->need_trans == 0) + tmp += input_buf[n* hidden + j] * weight_buf[i*hidden + j]; + else + tmp += input_buf[n* hidden + j] * weight_buf[i + j*out_number]; + } + int quant_tmp = round(tmp/param->scale[2]) + param->zero[2]; + if(quant_tmp > 255) quant_tmp = 255; + if(quant_tmp < 0) quant_tmp = 0; + output[n*out_number + i ] = quant_tmp; + } + } + + free(weight_buf); + free(input_buf); + return 0; +} + diff --git a/executor/operator/ref/kernel/lrn/ref_lrn_fp16.c b/executor/operator/ref/kernel/lrn/ref_lrn_fp16.c new file mode 100644 index 000000000..37ebbcbe0 --- /dev/null +++ b/executor/operator/ref/kernel/lrn/ref_lrn_fp16.c @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ + +static int ref_lrn_fp16(const __fp16* in_data, __fp16* out_data, ref_lrn_param* param) +{ + int n = param->dims[0]; + int c = param->dims[1]; + int h = param->dims[2]; + int w = param->dims[3]; + + float alpha = param->alpha; + float beta = param->beta; + float bias = param->bias; + int local_size = param->local_size; + + int channel_size = h * w; + int img_size = c * channel_size; + + float* square = ( float* )(malloc(img_size * sizeof(float))); + float* accum_square = ( float* )(malloc(channel_size * sizeof(float))); + + for(int i = 0; i < n; i++) + { + const __fp16* img_base = in_data + i * img_size; + + /* get square value */ + for(int j = 0; j < img_size; j++) + { + float img_data = fp16_to_fp32(img_base[j]); + square[j] = img_data * img_data + bias; + } + + if(param->norm_region == 0) /* LRN_ACROSS_CHANNELS */ + { + float alpha_over_size = alpha / local_size; + + for(int j = 0; j < c; j++) + { + int c_start = j - local_size / 2; + int c_end = j + local_size / 2; + + memset(accum_square, 0x0, channel_size * sizeof(float)); + + for(int l = c_start; l <= c_end; l++) + { + if(l < 0 || l >= c) + continue; + + for(int n = 0; n < channel_size; n++) + { + accum_square[n] += square[l * channel_size + n]; + } + } + + /* get the output */ + for(int n = 0; n < channel_size; n++) + { + int offset = i * img_size + j * channel_size + n; + float input_f = fp16_to_fp32(in_data[offset]); + float output_f = input_f * pow(1.0f + alpha_over_size * accum_square[n], -beta); + out_data[offset] = fp32_to_fp16(output_f); + } + } + } + else + { + printf("LRN: IN CHANNEL, TO BE IMPLEMENTED\n"); + } + } + + free(square); + free(accum_square); + return 0; +} diff --git a/executor/operator/ref/kernel/lrn/ref_lrn_fp32.c b/executor/operator/ref/kernel/lrn/ref_lrn_fp32.c new file mode 100644 index 000000000..eb843c5cb --- /dev/null +++ b/executor/operator/ref/kernel/lrn/ref_lrn_fp32.c @@ -0,0 +1,90 @@ +/* 
+ * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ + +static int ref_lrn_fp32(const float* in_data, float* out_data, ref_lrn_param* param) +{ + int n = param->dims[0]; + int c = param->dims[1]; + int h = param->dims[2]; + int w = param->dims[3]; + + float alpha = param->alpha; + float beta = param->beta; + float bias = param->bias; + int local_size = param->local_size; + + int channel_size = h * w; + int img_size = c * channel_size; + + float* square = ( float* )(malloc(img_size * sizeof(float))); + float* accum_square = ( float* )(malloc(channel_size * sizeof(float))); + + for(int i = 0; i < n; i++) + { + const float* img_base = in_data + i * img_size; + + /* get square value */ + for(int j = 0; j < img_size; j++) + square[j] = img_base[j] * img_base[j] + bias; + + if(param->norm_region == 0) /* LRN_ACROSS_CHANNELS */ + { + float alpha_over_size = alpha / local_size; + + for(int j = 0; j < c; j++) + { + int c_start = j - local_size / 2; + int c_end = j + local_size / 2; + + memset(accum_square, 0x0, channel_size * sizeof(float)); + + for(int l = c_start; l <= c_end; l++) + { + if(l < 0 || l >= c) + continue; + + for(int n = 0; n < 
channel_size; n++) + { + accum_square[n] += square[l * channel_size + n]; + } + } + + /* get the output */ + for(int n = 0; n < channel_size; n++) + { + int offset = i * img_size + j * channel_size + n; + out_data[offset] = in_data[offset] * pow(1.0f + alpha_over_size * accum_square[n], -beta); + } + } + } + else + { + printf("LRN: IN CHANNEL, TO BE IMPLEMENTED\n"); + } + } + + free(square); + free(accum_square); + return 0; +} diff --git a/executor/operator/ref/kernel/lrn/ref_lrn_int8.c b/executor/operator/ref/kernel/lrn/ref_lrn_int8.c new file mode 100644 index 000000000..ec7bfb5f6 --- /dev/null +++ b/executor/operator/ref/kernel/lrn/ref_lrn_int8.c @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ + +static int ref_lrn_int8(const int8_t* in_data, int8_t* out_data, ref_lrn_param* param) +{ + int n = param->dims[0]; + int c = param->dims[1]; + int h = param->dims[2]; + int w = param->dims[3]; + + float alpha = param->alpha; + float beta = param->beta; + float bias = param->bias; + int local_size = param->local_size; + + int channel_size = h * w; + int img_size = c * channel_size; + int input_size = n * img_size; + + float* square = ( float* )(malloc(img_size * sizeof(float))); + float* accum_square = ( float* )(malloc(channel_size * sizeof(float))); + float* input_f = ( float* )(malloc(input_size * sizeof(float))); + float* output_f = ( float* )(malloc(input_size * sizeof(float))); + + for(int i = 0; i < input_size; i++) + input_f[i] = in_data[i] * param->scale[0]; + + for(int i = 0; i < n; i++) + { + const float* img_base = input_f + i * img_size; + + /* get square value */ + for(int j = 0; j < img_size; j++) + square[j] = img_base[j] * img_base[j] + bias; + + if(param->norm_region == 0) /* LRN_ACROSS_CHANNELS */ + { + float alpha_over_size = alpha / local_size; + + for(int j = 0; j < c; j++) + { + int c_start = j - local_size / 2; + int c_end = j + local_size / 2; + + memset(accum_square, 0x0, channel_size * sizeof(float)); + + for(int l = c_start; l <= c_end; l++) + { + if(l < 0 || l >= c) + continue; + + for(int n = 0; n < channel_size; n++) + { + accum_square[n] += square[l * channel_size + n]; + } + } + + /* get the output */ + for(int n = 0; n < channel_size; n++) + { + int offset = i * img_size + j * channel_size + n; + output_f[offset] = input_f[offset] * pow(1.0f + alpha_over_size * accum_square[n], -beta); + } + } + } + else + { + printf("LRN: IN CHANNEL, TO BE IMPLEMENTED\n"); + } + } + + float max_val = 0.0f; + for(int i = 0; i < input_size; i++) + { + if(max_val < fabs(output_f[i])) + max_val = fabs(output_f[i]); + } + float out_scale = max_val / 127; + for(int 
i = 0; i < input_size; i++) + { + out_data[i] = (int8_t)(round(output_f[i] / out_scale)); + } + param->scale[1] = out_scale; + + free(square); + free(accum_square); + free(input_f); + free(output_f); + return 0; +} diff --git a/executor/operator/ref/kernel/lrn/ref_lrn_kernel.h b/executor/operator/ref/kernel/lrn/ref_lrn_kernel.h new file mode 100644 index 000000000..9f32135ce --- /dev/null +++ b/executor/operator/ref/kernel/lrn/ref_lrn_kernel.h @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ + +#ifndef __REF_LRN_KERNEL_H__ +#define __REF_LRN_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ref_lrn_param +{ + float alpha; + float beta; + float bias; + int local_size; + int norm_region; + int layout; + int dims[4]; + int zero[2]; /* input, output */ + float scale[2]; /* input, output */ +}; + +typedef int (*ref_lrn_kernel_t)(const void* in_data, void* out_data, ref_lrn_param* param); + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_lrn_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_lrn_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_lrn_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_lrn_uint8.c" +#endif + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/executor/operator/ref/kernel/lrn/ref_lrn_uint8.c b/executor/operator/ref/kernel/lrn/ref_lrn_uint8.c new file mode 100644 index 000000000..41252fd2a --- /dev/null +++ b/executor/operator/ref/kernel/lrn/ref_lrn_uint8.c @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ + +static int ref_lrn_uint8(const uint8_t* in_data, uint8_t* out_data, ref_lrn_param* param) +{ + int n = param->dims[0]; + int c = param->dims[1]; + int h = param->dims[2]; + int w = param->dims[3]; + + float alpha = param->alpha; + float beta = param->beta; + float bias = param->bias; + int local_size = param->local_size; + + int channel_size = h * w; + int img_size = c * channel_size; + + float* square = ( float* )(malloc(img_size * sizeof(float))); + float* accum_square = ( float* )(malloc(channel_size * sizeof(float))); + + for(int i = 0; i < n; i++) + { + const uint8_t* img_base = in_data + i * img_size; + + /* get square value */ + for(int j = 0; j < img_size; j++) + { + float img_data = (img_base[j] - param->zero[0]) * param->scale[0]; + square[j] = img_data * img_data + bias; + } + + if(param->norm_region == 0) /* LRN_ACROSS_CHANNELS */ + { + float alpha_over_size = alpha / local_size; + + for(int j = 0; j < c; j++) + { + int c_start = j - local_size / 2; + int c_end = j + local_size / 2; + + memset(accum_square, 0x0, channel_size * sizeof(float)); + + for(int l = c_start; l <= c_end; l++) + { + if(l < 0 || l >= c) + continue; + + for(int n = 0; n < channel_size; n++) + { + accum_square[n] += square[l * channel_size + n]; + } + } + + /* get the output */ + for(int n = 0; n < channel_size; n++) + { + int offset = i * img_size + j * channel_size + n; + float output_f = in_data[offset] * pow(1.0f + alpha_over_size * accum_square[n], -beta); + out_data[offset] = (uint8_t)(round(output_f / param->scale[1]) + param->zero[1]); + } + } + } + else + { + printf("LRN: IN CHANNEL, TO BE IMPLEMENTED\n"); + } + } + + free(square); + free(accum_square); + return 0; +} diff --git a/executor/operator/ref/kernel/pad/pad_fp16.c b/executor/operator/ref/kernel/pad/pad_fp16.c new file mode 100644 index 000000000..0e22417d0 --- /dev/null +++ b/executor/operator/ref/kernel/pad/pad_fp16.c @@ -0,0 
+1,133 @@ +static int pad_fp16(__fp16 * data,__fp16 * out_data,pad_param * param) +{ + + if (param->mode==0) + { + //support pad on h,w dim only + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + if (h < 0 || w < 0 || h >= param->in_h || w >= param->in_w) + { + for (int c = 0; c < param->in_c; ++c) + { + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + out_data[pad_index + c] = fp32_to_fp16(fp16_to_fp32(param->cv_f16)); + #else + out_data[pad_index + c] = param->cv_f16; + #endif + } + } + else + { + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else if (param->mode==1) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + h = MAX(h, -h); + h = MIN(h, 2 * param->in_h - h - 2); + w = MAX(w, -w); + w = MIN(w, 2 * param->in_w - w - 2); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. 
+ data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + } + else if(param->mode==2) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int w = pw - param->pad_2_h; + int h = ph - param->pad_1_h; + h = MAX(h, -h-1); + h = MIN(h, 2 * param->in_h - h - 1); + w = MAX(w, -w-1); + w = MIN(w, 2 * param->in_w - w - 1); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + + out_data[pad_index + c] = data[input_index + c]; + + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else + { + return -1; + } + return 0; + + +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/pad/pad_fp32.c b/executor/operator/ref/kernel/pad/pad_fp32.c new file mode 100644 index 000000000..80873fdc8 --- /dev/null +++ b/executor/operator/ref/kernel/pad/pad_fp32.c @@ -0,0 +1,129 @@ +static int pad_fp32(float * data,float * out_data,pad_param * param) +{ + if (param->mode==0) + { + //support pad on h,w dim only + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + if (h < 0 || w < 0 || h >= param->in_h || w >= param->in_w) + { + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = param->cv_f32; + } + } + else + { + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 
0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else if (param->mode==1) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + h = MAX(h, -h); + h = MIN(h, 2 * param->in_h - h - 2); + w = MAX(w, -w); + w = MIN(w, 2 * param->in_w - w - 2); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + } + else if(param->mode==2) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + // int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + int h = ph - param->pad_1_h; + h = MAX(h, -h-1); + h = MIN(h, 2 * param->in_h - h - 1); + w = MAX(w, -w-1); + w = MIN(w, 2 * param->in_w - w - 1); + + const int input_index = (h * param->in_w + w) * param->in_c ; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. 
+ data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else + { + return -1; + } + + + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/pad/pad_int8.c b/executor/operator/ref/kernel/pad/pad_int8.c new file mode 100644 index 000000000..f04b5a3d2 --- /dev/null +++ b/executor/operator/ref/kernel/pad/pad_int8.c @@ -0,0 +1,126 @@ +static int pad_int8(int8_t * data,int8_t * out_data,pad_param * param) +{ + if (param->mode==0) + { + //support pad on h,w dim only + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + if (h < 0 || w < 0 || h >= param->in_h || w >= param->in_w) + { + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = param->cv_int8; + } + } + else + { + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + } + // Do offset. 
+ data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else if (param->mode==1) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + h = MAX(h, -h); + h = MIN(h, 2 * param->in_h - h - 2); + w = MAX(w, -w); + w = MIN(w, 2 * param->in_w - w - 2); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + } + else if(param->mode==2) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int w = pw - param->pad_2_h; + int h = ph - param->pad_1_h; + h = MAX(h, -h-1); + h = MIN(h, 2 * param->in_h - h - 1); + w = MAX(w, -w-1); + w = MIN(w, 2 * param->in_w - w - 1); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. 
+ data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else + { + return -1; + } + + + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/pad/pad_kernel.h b/executor/operator/ref/kernel/pad/pad_kernel.h new file mode 100644 index 000000000..4e7b5aca3 --- /dev/null +++ b/executor/operator/ref/kernel/pad/pad_kernel.h @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __PAD_KERNEL_H__ +#define __PAD_KERNEL_H__ + +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct pad_param; + +struct pad_param +{ + int mode; + float cv_f32; + __fp16 cv_f16; + int8_t cv_int8; + uint8_t cv_uint8; + int in_size; + int out_size; + int in_n; + int in_h; + int in_w; + int in_c; + int out_h; + int out_w; + int out_n; + int pad_0_h; + int pad_0_w; + int pad_1_h; + int pad_1_w; + int pad_2_h; + int pad_2_w; + int pad_3_h; + int pad_3_w; + float scale[2]; + int zero[2]; +}; + +typedef int (*pad_t)(void * data,void * out_data,pad_param* param); + +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) + +#ifdef CONFIG_KERNEL_FP32 +#include "pad_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "pad_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "pad_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "pad_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/pad/pad_uint8.c b/executor/operator/ref/kernel/pad/pad_uint8.c new file mode 100644 index 000000000..f8053af58 --- /dev/null +++ b/executor/operator/ref/kernel/pad/pad_uint8.c @@ -0,0 +1,126 @@ +static int pad_uint8(uint8_t * data,uint8_t * out_data,pad_param * param) +{ + if (param->mode==0) + { + //support pad on h,w dim only + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + if (h < 0 || w < 0 || h >= param->in_h || w >= param->in_w) + { + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = param->cv_uint8; + } + } + 
else + { + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else if (param->mode==1) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int h = ph - param->pad_1_h; + int w = pw - param->pad_2_h; + h = MAX(h, -h); + h = MIN(h, 2 * param->in_h - h - 2); + w = MAX(w, -w); + w = MIN(w, 2 * param->in_w - w - 2); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. + data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + } + else if(param->mode==2) + { + if(param->pad_0_h==0 && param->pad_0_w==0 && param->pad_3_h==0 && param->pad_3_w==0 ) + { + for (int n = 0; n < param->in_n; ++n) + { + for (int ph = 0; ph < param->out_h; ++ph) + { + for (int pw = 0; pw < param->out_w; ++pw) + { + const int pad_index = (ph * param->out_w+ pw) * param->in_c; + int w = pw - param->pad_2_h; + int h = ph - param->pad_1_h; + h = MAX(h, -h-1); + h = MIN(h, 2 * param->in_h - h - 1); + w = MAX(w, -w-1); + w = MIN(w, 2 * param->in_w - w - 1); + const int input_index = (h * param->in_w + w) * param->in_c; + for (int c = 0; c < param->in_c; ++c) + { + out_data[pad_index + c] = data[input_index + c]; + } + } + } + // Do offset. 
+ data += param->in_size / param->in_n; + out_data += param->out_size/ param->out_n; + } + + } + else + { + return -1; + } + + + } + else + { + return -1; + } + + + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/permute/permute_fp16.c b/executor/operator/ref/kernel/permute/permute_fp16.c new file mode 100644 index 000000000..25008a2f8 --- /dev/null +++ b/executor/operator/ref/kernel/permute/permute_fp16.c @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + +static void __hwc_fp16(const __fp16* input,__fp16* output,int hh,int ww,int cc,int wc,int hw) +{ + for(int h=0; hlayout == TENGINE_LAYOUT_NCHW) + { + n = param->in_dim[0]; + c = param->in_dim[1]; + h = param->in_dim[2]; + w = param->in_dim[3]; + } + else + { + n = param->in_dim[0]; + h = param->in_dim[1]; + w = param->in_dim[2]; + c = param->in_dim[3]; + } + + int wc = w * c; + int hw = h * w; + int chw = c * hw; + + const __fp16* input = in_data; + __fp16* output = out_data; + if( param->order0 == 0 && param->order1 == 2 && param->order2 == 3 && param->order3 == 1 ) + { + for(int ii=0; iiorder0 == 0 && param->order1 == 3 && param->order2 == 1 && param->order3 == 2 ) + { + for(int ii=0; iilayout == TENGINE_LAYOUT_NCHW) + { + n = param->in_dim[0]; + c = param->in_dim[1]; + h = param->in_dim[2]; + w = param->in_dim[3]; + } + else + { + n = param->in_dim[0]; + h = param->in_dim[1]; + w = param->in_dim[2]; + c = param->in_dim[3]; + } + + int wc = w * c; + int hw = h * w; + int chw = c * hw; + + const float* input = in_data; + float* output = out_data; + if( param->order0 == 0 && param->order1 == 2 && param->order2 == 3 && param->order3 == 1 ) + { + for(int ii=0; iiorder0 == 0 && param->order1 == 3 && param->order2 == 1 && param->order3 == 2 ) + { + for(int ii=0; iilayout == TENGINE_LAYOUT_NCHW) + { + n = param->in_dim[0]; + c = param->in_dim[1]; + h = param->in_dim[2]; + w = param->in_dim[3]; + } + else + { + n = param->in_dim[0]; + h = param->in_dim[1]; + w = param->in_dim[2]; + c = param->in_dim[3]; + } + + int wc = w * c; + int hw = h * w; + int chw = c * hw; + + const int8_t* input = in_data; + int8_t* output = out_data; + if( param->order0 == 0 && param->order1 == 2 && param->order2 == 3 && param->order3 == 1 ) + { + for(int ii=0; iiorder0 == 0 && param->order1 == 3 && param->order2 == 1 && param->order3 == 2 ) + { + for(int ii=0; ii +#include + +#include "compiler_fp16.h" + 
+#ifdef __cplusplus +extern "C" { +#endif + +struct permute_param +{ + int order0; + int order1; + int order2; + int order3; + + int in_dim[4]; + int layout; +}; + +typedef int (*permute_t)(const void* in_data,void* out_data,const permute_param* param) ; + +#ifdef CONFIG_KERNEL_FP32 +#include "permute_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "permute_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "permute_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "permute_uint8.c" +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/permute/permute_uint8.c b/executor/operator/ref/kernel/permute/permute_uint8.c new file mode 100644 index 000000000..b706a4878 --- /dev/null +++ b/executor/operator/ref/kernel/permute/permute_uint8.c @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + +static void __hwc_uint8(const uint8_t* input,uint8_t* output,int hh,int ww,int cc,int wc,int hw) +{ + for(int h=0; hlayout == TENGINE_LAYOUT_NCHW) + { + n = param->in_dim[0]; + c = param->in_dim[1]; + h = param->in_dim[2]; + w = param->in_dim[3]; + } + else + { + n = param->in_dim[0]; + h = param->in_dim[1]; + w = param->in_dim[2]; + c = param->in_dim[3]; + } + + int wc = w * c; + int hw = h * w; + int chw = c * hw; + + const uint8_t* input = in_data; + uint8_t* output = out_data; + if( param->order0 == 0 && param->order1 == 2 && param->order2 == 3 && param->order3 == 1 ) + { + for(int ii=0; iiorder0 == 0 && param->order1 == 3 && param->order2 == 1 && param->order3 == 2 ) + { + for(int ii=0; iitmp ? max_f : tmp; + + } + *max = fp32_to_fp16(max_f); +#else + *max = 0.0f; + __fp16 tmp = 0.0f; + if(layout == 0) + *max = input[cur_ch*h*w + start_h*w + start_w]; + else + *max = input[start_h*w*c + start_w*c + cur_ch]; + + for(int i=start_h;itmp ? 
*max : tmp; + + } + +#endif +} + +static int ref_pooling_fp16(const __fp16 * input, __fp16 * output, struct op_data* param) +{ + int input_chw = param->channel * param->input[0]*param->input[1]; + int output_chw = param->channel * param->output[0]*param->output[1]; + + for(int n = 0; n < param->batch; n++) + { + const __fp16* input_cur = input + n*input_chw; + for(int c = 0; c < param->channel; c++) + { + for(int ph = 0; ph < param->output[0]; ph++) + { + for(int pw = 0; pw < param->output[1]; pw++) + { + int pool_size = 1; + int offset = 0; + int h_start = ph * param->strides[0] - param->pads[0]; + int h_end = h_start + param->kernels[0]; + if( h_end > param->input[0] + param->pads[0]) + h_end = param->input[0] + param->pads[0]; + int w_start = pw * param->strides[1] - param->pads[1]; + int w_end = w_start + param->kernels[1]; + if( w_end > param->input[1] + param->pads[1]) + w_end = param->input[1] + param->pads[1]; + + if(param->caffe_flavor) + pool_size = (h_end - h_start) * (w_end - w_start); + + h_start = h_start > 0 ? h_start : 0; + w_start = w_start > 0 ? w_start : 0; + h_end = h_end < param->input[0] ? h_end : param->input[0]; + w_end = w_end < param->input[1] ? 
w_end : param->input[1]; + //printf("w: %d,%d ,h: %d,%d\n",w_start,w_end,h_start,h_end); + + if(!param->caffe_flavor) + pool_size = (h_end - h_start) * (w_end - w_start); + if(param->layout == 0) //nchw + offset = n*output_chw + c*param->output[0]*param->output[1] + + ph*param->output[1] + pw; + else + offset =n*output_chw + ph*param->output[1]*param->channel + + pw*param->channel + c; + + if(param->method == 0) + { + __fp16 max; + calc_max_fp16(input_cur, &max, param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + output[offset] = max; + } + else if( param->method == 1) + { + __fp16 sum; + calc_sum_fp16(input_cur, &sum, param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + output[offset] = fp32_to_fp16( fp16_to_fp32(sum) / pool_size ); +#else + output[offset] = sum/pool_size; +#endif + } + else + return -1; + } + } + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/pooling/ref_pooling_fp32.c b/executor/operator/ref/kernel/pooling/ref_pooling_fp32.c new file mode 100644 index 000000000..14b3de214 --- /dev/null +++ b/executor/operator/ref/kernel/pooling/ref_pooling_fp32.c @@ -0,0 +1,104 @@ + +static inline float calc_sum(const float* input, int layout, int c, int h, int w, int cur_ch, int start_h, int start_w, int end_h, int end_w) +{ + float sum = 0.0f; + for(int i=start_h;itmp ? 
max : tmp; + + } + + return max; +} + +static int ref_pooling_fp32(const float * input, float * output, struct op_data* param) +{ + int input_chw = param->channel * param->input[0]*param->input[1]; + int output_chw = param->channel * param->output[0]*param->output[1]; + + for(int n = 0; n < param->batch; n++) + { + const float* input_cur = input + n*input_chw; + for(int c = 0; c < param->channel; c++) + { + for(int ph = 0; ph < param->output[0]; ph++) + { + for(int pw = 0; pw < param->output[1]; pw++) + { + int pool_size = 1; + int offset = 0; + int h_start = ph * param->strides[0] - param->pads[0]; + int h_end = h_start + param->kernels[0]; + if( h_end > param->input[0] + param->pads[0]) + h_end = param->input[0] + param->pads[0]; + int w_start = pw * param->strides[1] - param->pads[1]; + int w_end = w_start + param->kernels[1]; + if( w_end > param->input[1] + param->pads[1]) + w_end = param->input[1] + param->pads[1]; + + if(param->caffe_flavor) + pool_size = (h_end - h_start) * (w_end - w_start); + + h_start = h_start > 0 ? h_start : 0; + w_start = w_start > 0 ? w_start : 0; + h_end = h_end < param->input[0] ? h_end : param->input[0]; + w_end = w_end < param->input[1] ? 
w_end : param->input[1]; + //printf("w: %d,%d ,h: %d,%d\n",w_start,w_end,h_start,h_end); + + if(!param->caffe_flavor) + pool_size = (h_end - h_start) * (w_end - w_start); + if(param->layout == 0) //nchw + offset = n*output_chw + c*param->output[0]*param->output[1] + + ph*param->output[1] + pw; + else + offset =n*output_chw + ph*param->output[1]*param->channel + + pw*param->channel + c; + + if(param->method == 0) + { + float max = calc_max(input_cur,param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + output[offset] = max; + } + else if( param->method == 1) + { + float sum = calc_sum(input_cur,param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + output[offset] = sum/pool_size; + } + else + return -1; + } + } + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/pooling/ref_pooling_int8.c b/executor/operator/ref/kernel/pooling/ref_pooling_int8.c new file mode 100644 index 000000000..d5071096a --- /dev/null +++ b/executor/operator/ref/kernel/pooling/ref_pooling_int8.c @@ -0,0 +1,104 @@ + +static inline int calc_sum_int8(const int8_t* input, int layout, int c, int h, int w, int cur_ch, int start_h, int start_w, int end_h, int end_w) +{ + int sum = 0; + for(int i=start_h;itmp ? 
max : tmp; + + } + + return max; +} + +static int ref_pooling_int8(const int8_t* input, int8_t * output, struct op_data* param) +{ + int input_chw = param->channel * param->input[0]*param->input[1]; + int output_chw = param->channel * param->output[0]*param->output[1]; + + for(int n = 0; n < param->batch; n++) + { + const int8_t* input_cur = input + n*input_chw; + for(int c = 0; c < param->channel; c++) + { + for(int ph = 0; ph < param->output[0]; ph++) + { + for(int pw = 0; pw < param->output[1]; pw++) + { + int pool_size = 1; + int offset = 0; + int h_start = ph * param->strides[0] - param->pads[0]; + int h_end = h_start + param->kernels[0]; + if( h_end > param->input[0] + param->pads[0]) + h_end = param->input[0] + param->pads[0]; + int w_start = pw * param->strides[1] - param->pads[1]; + int w_end = w_start + param->kernels[1]; + if( w_end > param->input[1] + param->pads[1]) + w_end = param->input[1] + param->pads[1]; + + if(param->caffe_flavor) + pool_size = (h_end - h_start) * (w_end - w_start); + + h_start = h_start > 0 ? h_start : 0; + w_start = w_start > 0 ? w_start : 0; + h_end = h_end < param->input[0] ? h_end : param->input[0]; + w_end = w_end < param->input[1] ? 
w_end : param->input[1]; + //printf("w: %d,%d ,h: %d,%d\n",w_start,w_end,h_start,h_end); + + if(!param->caffe_flavor) + pool_size = (h_end - h_start) * (w_end - w_start); + if(param->layout == 0) //nchw + offset = n*output_chw + c*param->output[0]*param->output[1] + + ph*param->output[1] + pw; + else + offset =n*output_chw + ph*param->output[1]*param->channel + + pw*param->channel + c; + + if(param->method == 0) + { + int8_t max = calc_max_int8(input_cur,param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + output[offset] = max; + } + else if( param->method == 1) + { + int sum = calc_sum_int8(input_cur,param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + output[offset] = (int8_t)round(sum/pool_size); + } + else + return -1; + } + } + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/pooling/ref_pooling_kernel.h b/executor/operator/ref/kernel/pooling/ref_pooling_kernel.h new file mode 100644 index 000000000..fc7e28c46 --- /dev/null +++ b/executor/operator/ref/kernel/pooling/ref_pooling_kernel.h @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#ifndef __REF_POOLING_KERNEL_H__ +#define __REF_POOLING_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif +struct op_data{ + int layout; + int batch; + int channel; + int method; + int input[2]; + int output[2]; + int kernels[2]; + int strides[2]; + int pads[2]; + int caffe_flavor; + int zero_point; + int align[4]; +}; + + +typedef int (*ref_pooling_kernel_t)(const void * input, void * output, struct op_data* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_pooling_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_pooling_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_pooling_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_pooling_uint8.c" +#endif + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/pooling/ref_pooling_uint8.c b/executor/operator/ref/kernel/pooling/ref_pooling_uint8.c new file mode 100644 index 000000000..e3ec7a78b --- /dev/null +++ b/executor/operator/ref/kernel/pooling/ref_pooling_uint8.c @@ -0,0 +1,113 @@ + +static inline int calc_sum_uint8(const uint8_t* input, int layout, int c, int h, int w, int cur_ch, int start_h, int start_w, int end_h, int end_w) +{ + int sum = 0; + for(int i=start_h;itmp ? 
max : tmp; + + } + + return max; +} + +static int ref_pooling_uint8(const uint8_t* input, uint8_t * output, struct op_data* param) +{ + int input_chw = param->channel * param->input[0]*param->input[1]; + int output_chw = param->channel * param->output[0]*param->output[1]; + + int zero_point = param->zero_point; + + for(int n = 0; n < param->batch; n++) + { + const uint8_t* input_cur = input + n*input_chw; + for(int c = 0; c < param->channel; c++) + { + for(int ph = 0; ph < param->output[0]; ph++) + { + for(int pw = 0; pw < param->output[1]; pw++) + { + int pool_size = 1; + int pool_size_caffe = 1; + int offset = 0; + int h_start = ph * param->strides[0] - param->pads[0]; + int h_end = h_start + param->kernels[0]; + if( h_end > param->input[0] + param->pads[0]) + h_end = param->input[0] + param->pads[0]; + int w_start = pw * param->strides[1] - param->pads[1]; + int w_end = w_start + param->kernels[1]; + if( w_end > param->input[1] + param->pads[1]) + w_end = param->input[1] + param->pads[1]; + + if(param->caffe_flavor) + pool_size_caffe = (h_end - h_start) * (w_end - w_start); + + h_start = h_start > 0 ? h_start : 0; + w_start = w_start > 0 ? w_start : 0; + h_end = h_end < param->input[0] ? h_end : param->input[0]; + w_end = w_end < param->input[1] ? 
w_end : param->input[1]; + //printf("w: %d,%d ,h: %d,%d\n",w_start,w_end,h_start,h_end); + + pool_size = (h_end - h_start) * (w_end - w_start); + if(!param->caffe_flavor) + pool_size_caffe = (h_end - h_start) * (w_end - w_start); + + if(param->layout == 0) //nchw + offset = n*output_chw + c*param->output[0]*param->output[1] + + ph*param->output[1] + pw; + else + offset =n*output_chw + ph*param->output[1]*param->channel + + pw*param->channel + c; + + if(param->method == 0) + { + uint8_t max = calc_max_uint8(input_cur,param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + output[offset] = max; + } + else if( param->method == 1) + { + int sum = calc_sum_uint8(input_cur,param->layout,param->channel,param->input[0],param->input[1], + c,h_start,w_start,h_end,w_end); + // (a-z)*s + ... + (n-z)*s = (output-z)*s*pool_size_caffe + // (a+...+z)-pool_size*z = output*pool_size_caffe - z* pool_size_caffe + // output = ( sum + (pool_size_caffe - pool_size)*z )/pool_size_caffe + int diff_size = pool_size_caffe - pool_size; + output[offset] = (uint8_t)round((sum + diff_size*zero_point)/pool_size_caffe); + } + else + return -1; + } + } + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/prelu/prelu_fp16.c b/executor/operator/ref/kernel/prelu/prelu_fp16.c new file mode 100644 index 000000000..49e9a6c5d --- /dev/null +++ b/executor/operator/ref/kernel/prelu/prelu_fp16.c @@ -0,0 +1,38 @@ +static int prelu_fp16(__fp16 * data,__fp16 * out_data, int dim0,int dim1,int dim2,int dim3,float* slope,prelu_param* param) +{ + int offset=0; + //__fp16* data = ( __fp16* )data; + for(int i = 0; i < dim0; i++) + { + for(int c = 0; c < dim1; c++) + { + for(int l = 0; l < dim2; l++) + { + for(int k = 0; k < dim3; k++) + { + if(param->layout==0) + { + //nchw + offset = i*dim1*dim2*dim3 + c*dim2*dim3 + l*dim3 + k; + + } + else + { + //nhwc + offset = i*dim1*dim2*dim3 + l*dim3*dim1 + k*dim1 + c; + } +/* for arm32 && x86 */ +#if!defined( __ARM_ARCH) 
|| __ARM_ARCH <8 + float output_real = MAX(fp16_to_fp32(data[offset]), 0) + slope[c] * MIN(fp16_to_fp32(data[offset]), 0.f); + out_data[offset] = fp32_to_fp16(output_real); + +#else + out_data[offset] = MAX(data[offset], 0) + slope[c] * MIN(data[offset], 0.f); +#endif + } + } + + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/prelu/prelu_fp32.c b/executor/operator/ref/kernel/prelu/prelu_fp32.c new file mode 100644 index 000000000..1654ae92e --- /dev/null +++ b/executor/operator/ref/kernel/prelu/prelu_fp32.c @@ -0,0 +1,33 @@ + +static int prelu_fp32(float * data,float * out_data, int dim0,int dim1,int dim2,int dim3,float* slope,const prelu_param * param) +{ + int offset=0; + //nchw + //nhwc + for(int i = 0; i < dim0; i++) + { + for(int c = 0; c < dim1; c++) + { + for(int l = 0; l < dim2; l++) + { + for(int k = 0; k < dim3; k++) + { + if(param->layout==0) + { + //nchw + offset = i*dim1*dim2*dim3 + c*dim2*dim3 + l*dim3 + k; + + } + else + { + //nhwc + offset = i*dim1*dim2*dim3 + l*dim3*dim1 + k*dim1 + c; + } + out_data[offset] = MAX(data[offset], 0) + slope[c] * MIN(data[offset], 0.f); + } + } + + } + } + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/prelu/prelu_int8.c b/executor/operator/ref/kernel/prelu/prelu_int8.c new file mode 100644 index 000000000..2b209b88f --- /dev/null +++ b/executor/operator/ref/kernel/prelu/prelu_int8.c @@ -0,0 +1,34 @@ + +static int prelu_int8(int8_t * data,int8_t * out_data, int dim0,int dim1,int dim2,int dim3,float* slope,const prelu_param * param) +{ + int offset; + + for(int i = 0; i < dim0; i++) + { + for(int c = 0; c < dim1; c++) + { + for(int l = 0; l < dim2; l++) + { + for(int k = 0; k < dim3; k++) + { + if(param->layout==0) + { + //nchw + offset=i*dim1*dim2*dim3 + c*dim2*dim3 + l*dim3 + k; + + } + else + { + //nhwc + offset=i*dim1*dim2*dim3 + l*dim3*dim1 + k*dim1 + c; + } + float real_input = data[offset] * param->scale; + float real_output = MAX(real_input, 0) + slope[c] * 
MIN(real_input, 0.f); + out_data[offset] = round(real_output / param->scale); + } + } + + } + } + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/prelu/prelu_kernel.h b/executor/operator/ref/kernel/prelu/prelu_kernel.h new file mode 100644 index 000000000..f66eb8f63 --- /dev/null +++ b/executor/operator/ref/kernel/prelu/prelu_kernel.h @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __PRELU_KERNEL_H__ +#define __PRELU_KERNEL_H__ + +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct prelu_param; + +struct prelu_param +{ + int layout; + float scale; + int zero; +}; + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +typedef int (*prelu_t)(void * data,void * out_data, int dim0,int dim1,int dim2,int dim3,float* slope,const prelu_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "prelu_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "prelu_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "prelu_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "prelu_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/prelu/prelu_uint8.c b/executor/operator/ref/kernel/prelu/prelu_uint8.c new file mode 100644 index 000000000..c6541187a --- /dev/null +++ b/executor/operator/ref/kernel/prelu/prelu_uint8.c @@ -0,0 +1,34 @@ + +static int prelu_uint8(uint8_t * data,uint8_t * out_data, int dim0,int dim1,int dim2,int dim3,float* slope,const prelu_param * param) +{ + int offset; + + for(int i = 0; i < dim0; i++) + { + for(int c = 0; c < dim1; c++) + { + for(int l = 0; l < dim2; l++) + { + for(int k = 0; k < dim3; k++) + { + if(param->layout==0) + { + //nchw + offset=i*dim1*dim2*dim3 + c*dim2*dim3 + l*dim3 + k; + + } + else + { + //nhwc + offset=i*dim1*dim2*dim3 + l*dim3*dim1 + k*dim1 + c; + } + float real_input = (data[offset] - param->zero) * param->scale; + float real_output = MAX(real_input, 0) + slope[c] * MIN(real_input, 0.f); + out_data[offset] = round(real_output / param->scale) + param->zero; + } + } + + } + } + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/reduction/reduce.h b/executor/operator/ref/kernel/reduction/reduce.h new file mode 100644 index 000000000..1902e39c3 --- /dev/null +++ b/executor/operator/ref/kernel/reduction/reduce.h @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __REDUCE_KERNEL_H__ +#define __REDUCE_KERNEL_H__ + +#include +#include +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct reduce_param; + +struct reduce_param +{ + int layout; + int type; + int param_dim[4]; + float scale[2]; + int zero[2]; +}; + +typedef int (*reduce_t)(void * data,void * out_data, int dim0,int dim1,int dim2,int dim3,int out_size,reduce_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "reduce_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "reduce_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "reduce_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "reduce_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/reduction/reduce_fp16.c b/executor/operator/ref/kernel/reduction/reduce_fp16.c new file mode 100644 index 000000000..96e1ebc1a --- /dev/null +++ b/executor/operator/ref/kernel/reduction/reduce_fp16.c @@ -0,0 +1,1314 @@ +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 +void sum_4d_ax0_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void sum_4d_ax1_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void sum_4d_ax2_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * 
tmp); +void sum_4d_ax3_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void sum_3d_ax0_fp16(int dim1,int dim2,int dim3,float * tmp,float* tmp_01); +void sum_3d_ax1_fp16(int dim1,int dim2,int dim3,float * tmp,float* tmp_02); +void sum_3d_ax2_fp16(int dim1,int dim2,int dim3,float * tmp,float* tmp_03); +void sum_2d_ax0_fp16(int dim1,int dim2,float * tmp,float* tmp_0); +void sum_2d_ax1_fp16(int dim1,int dim2,float * tmp,float* tmp_1); + +void mean_4d_ax0_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void mean_4d_ax1_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void mean_4d_ax2_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void mean_4d_ax3_fp16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,float * tmp); +void mean_3d_ax0_fp16(int dim1,int dim2,int dim3,float * tmp,float* tmp_01); +void mean_3d_ax1_fp16(int dim1,int dim2,int dim3,float * tmp,float* tmp_02); +void mean_3d_ax2_fp16(int dim1,int dim2,int dim3,float * tmp,float* tmp_03); +void mean_2d_ax0_fp16(int dim1,int dim2,float * tmp,float* tmp_0); +void mean_2d_ax1_fp16(int dim1,int dim2,float * tmp,float* tmp_1); +#else +void sum_4d_ax0_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void sum_4d_ax1_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void sum_4d_ax2_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void sum_4d_ax3_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void sum_3d_ax0_f16(int dim1,int dim2,int dim3,__fp16 * tmp,__fp16* tmp_01); +void sum_3d_ax1_f16(int dim1,int dim2,int dim3,__fp16 * tmp,__fp16* tmp_02); +void sum_3d_ax2_f16(int dim1,int dim2,int dim3,__fp16 * tmp,__fp16* tmp_03); +void sum_2d_ax0_f16(int dim1,int dim2,__fp16 * tmp,__fp16* tmp_0); +void sum_2d_ax1_f16(int dim1,int dim2,__fp16 * tmp,__fp16* tmp_1); + +void mean_4d_ax0_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void 
mean_4d_ax1_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void mean_4d_ax2_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void mean_4d_ax3_f16(int dim0,int dim1,int dim2,int dim3,__fp16 * data,__fp16 * tmp); +void mean_3d_ax0_f16(int dim1,int dim2,int dim3,__fp16 * tmp,__fp16* tmp_01); +void mean_3d_ax1_f16(int dim1,int dim2,int dim3,__fp16 * tmp,__fp16* tmp_02); +void mean_3d_ax2_f16(int dim1,int dim2,int dim3,__fp16 * tmp,__fp16* tmp_03); +void mean_2d_ax0_f16(int dim1,int dim2,__fp16 * tmp,__fp16* tmp_0); +void mean_2d_ax1_f16(int dim1,int dim2,__fp16 * tmp,__fp16* tmp_1); +#endif + +static int reduce_fp16(__fp16 * data,__fp16 * out_data, int dim0, + int dim1,int dim2,int dim3,int out_size,reduce_param * param) +{ +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + int offset=0; + float* tmp=(float*)malloc(sizeof(float)*out_size); + memset(tmp, 0, sizeof(float) * out_size); + int param_dim0=param->param_dim[0]; + int param_dim1=param->param_dim[1]; + int param_dim2=param->param_dim[2]; + int param_dim3=param->param_dim[3]; + //reduce sum + if(param->type==0) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + tmp[0]+=fp16_to_fp32(data[offset]); + } + } + + } + } + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax2_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==3 && param_dim1==-2 && 
param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax3_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0_fp16(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_02); + sum_3d_ax1_fp16(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_03); + sum_3d_ax2_fp16(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp_12); + sum_3d_ax1_fp16(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp_13); + sum_3d_ax2_fp16(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else 
if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + sum_4d_ax2_fp16(dim0,dim1,dim2,dim3,data,tmp_23); + sum_3d_ax2_fp16(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0_fp16(dim1,dim2,dim3,tmp_0,tmp_01); + sum_2d_ax0_fp16(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0_fp16(dim1,dim2,dim3,tmp_1,tmp_01); + sum_2d_ax1_fp16(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + 
||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + sum_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_02); + sum_3d_ax1_fp16(dim1,dim2,dim3,tmp_1,tmp_02); + sum_2d_ax1_fp16(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + sum_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp_12); + sum_3d_ax1_fp16(dim0,dim2,dim3,tmp_1,tmp_12); + sum_2d_ax1_fp16(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //reduce mean + else if(param->type==1) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + float s_tmp=0.f; + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + s_tmp+=fp16_to_fp32(data[offset]); + } + } + } + } + tmp[0]=s_tmp / (dim0*dim1*dim2*dim3); + } + else 
if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax2_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax3_fp16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0_fp16(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_02); + mean_3d_ax1_fp16(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_03); + mean_3d_ax2_fp16(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp_12); + 
mean_3d_ax1_fp16(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp_13); + mean_3d_ax2_fp16(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + mean_4d_ax2_fp16(dim0,dim1,dim2,dim3,data,tmp_23); + mean_3d_ax2_fp16(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0_fp16(dim1,dim2,dim3,tmp_0,tmp_01); + mean_2d_ax0_fp16(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float 
*)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0_fp16(dim1,dim2,dim3,tmp_1,tmp_01); + mean_2d_ax1_fp16(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + mean_4d_ax0_fp16(dim0,dim1,dim2,dim3,data,tmp_02); + mean_3d_ax1_fp16(dim1,dim2,dim3,tmp_1,tmp_02); + mean_2d_ax1_fp16(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + mean_4d_ax1_fp16(dim0,dim1,dim2,dim3,data,tmp_12); + mean_3d_ax1_fp16(dim0,dim2,dim3,tmp_1,tmp_12); + mean_2d_ax1_fp16(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //pase to out_data + for(int i=0;iparam_dim[0]; + int 
param_dim1=param->param_dim[1]; + int param_dim2=param->param_dim[2]; + int param_dim3=param->param_dim[3]; + //reduce sum + if(param->type==0) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + tmp[0]+=data[offset]; + } + } + + } + } + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax2_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax3_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + __fp16 * tmp_01=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(__fp16) * dim1*dim2*dim3); + sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0_f16(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + __fp16 * tmp_02=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(__fp16) * dim1*dim2*dim3); + sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_02); + sum_3d_ax1_f16(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + 
__fp16 * tmp_03=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(__fp16) * dim1*dim2*dim3); + sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_03); + sum_3d_ax2_f16(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + __fp16 * tmp_12=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(__fp16) * dim0*dim2*dim3); + sum_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp_12); + sum_3d_ax1_f16(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + __fp16 * tmp_13=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(__fp16) * dim0*dim2*dim3); + sum_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp_13); + sum_3d_ax2_f16(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + __fp16 * tmp_23=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(__fp16) * dim0*dim1*dim3); + sum_4d_ax2_f16(dim0,dim1,dim2,dim3,data,tmp_23); + sum_3d_ax2_f16(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + __fp16 * tmp_01=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(__fp16) * dim1*dim2*dim3); + + __fp16 * tmp_0=(__fp16 *)malloc(sizeof(__fp16)*dim2*dim3); + memset(tmp_0, 0, sizeof(__fp16) * dim2*dim3); + + 
sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0_f16(dim1,dim2,dim3,tmp_0,tmp_01); + sum_2d_ax0_f16(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + __fp16 * tmp_01=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(__fp16) * dim1*dim2*dim3); + + __fp16 * tmp_1=(__fp16 *)malloc(sizeof(__fp16)*dim2*dim3); + memset(tmp_1, 0, sizeof(__fp16) * dim2*dim3); + + sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0_f16(dim1,dim2,dim3,tmp_1,tmp_01); + sum_2d_ax1_f16(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + __fp16 * tmp_02=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(__fp16) * dim1*dim2*dim3); + + __fp16 * tmp_1=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim3); + memset(tmp_1, 0, sizeof(__fp16) * dim1*dim3); + + sum_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_02); + sum_3d_ax1_f16(dim1,dim2,dim3,tmp_1,tmp_02); + sum_2d_ax1_f16(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 
&& param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + __fp16 * tmp_12=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(__fp16) * dim0*dim2*dim3); + + __fp16 * tmp_1=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim3); + memset(tmp_1, 0, sizeof(__fp16) * dim0*dim3); + + sum_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp_12); + sum_3d_ax1_f16(dim0,dim2,dim3,tmp_1,tmp_12); + sum_2d_ax1_f16(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //reduce mean + else if(param->type==1) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + __fp16 s_tmp=fp32_to_fp16(0); + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + s_tmp+=data[offset]; + } + } + } + } + tmp[0]=s_tmp / (dim0*dim1*dim2*dim3); + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax2_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax3_f16(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + __fp16 * tmp_01=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(__fp16) * dim1*dim2*dim3); + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0_f16(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( 
(param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + __fp16 * tmp_02=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(__fp16) * dim1*dim2*dim3); + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_02); + mean_3d_ax1_f16(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + __fp16 * tmp_03=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(__fp16) * dim1*dim2*dim3); + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_03); + mean_3d_ax2_f16(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + __fp16 * tmp_12=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(__fp16) * dim0*dim2*dim3); + mean_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp_12); + mean_3d_ax1_f16(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + __fp16 * tmp_13=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(__fp16) * dim0*dim2*dim3); + mean_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp_13); + mean_3d_ax2_f16(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + __fp16 * tmp_23=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(__fp16) * dim0*dim1*dim3); + mean_4d_ax2_f16(dim0,dim1,dim2,dim3,data,tmp_23); + mean_3d_ax2_f16(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && 
param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + __fp16 * tmp_01=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(__fp16) * dim1*dim2*dim3); + + __fp16 * tmp_0=(__fp16 *)malloc(sizeof(__fp16)*dim2*dim3); + memset(tmp_0, 0, sizeof(__fp16) * dim2*dim3); + + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0_f16(dim1,dim2,dim3,tmp_0,tmp_01); + mean_2d_ax0_f16(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + __fp16 * tmp_01=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(__fp16) * dim1*dim2*dim3); + + __fp16 * tmp_1=(__fp16 *)malloc(sizeof(__fp16)*dim2*dim3); + memset(tmp_1, 0, sizeof(__fp16) * dim2*dim3); + + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0_f16(dim1,dim2,dim3,tmp_1,tmp_01); + mean_2d_ax1_f16(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + __fp16 * tmp_02=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(__fp16) * dim1*dim2*dim3); + + __fp16 * tmp_1=(__fp16 *)malloc(sizeof(__fp16)*dim1*dim3); + 
memset(tmp_1, 0, sizeof(__fp16) * dim1*dim3); + + mean_4d_ax0_f16(dim0,dim1,dim2,dim3,data,tmp_02); + mean_3d_ax1_f16(dim1,dim2,dim3,tmp_1,tmp_02); + mean_2d_ax1_f16(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + __fp16 * tmp_12=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(__fp16) * dim0*dim2*dim3); + + __fp16 * tmp_1=(__fp16 *)malloc(sizeof(__fp16)*dim0*dim3); + memset(tmp_1, 0, sizeof(__fp16) * dim0*dim3); + + mean_4d_ax1_f16(dim0,dim1,dim2,dim3,data,tmp_12); + mean_3d_ax1_f16(dim0,dim2,dim3,tmp_1,tmp_12); + mean_2d_ax1_f16(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //pase to out_data + for(int i=0;iparam_dim[0]; + int param_dim1=param->param_dim[1]; + int param_dim2=param->param_dim[2]; + int param_dim3=param->param_dim[3]; + //reduce sum + if(param->type==0) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + tmp[0]+=data[offset]; + } + } + + } + } + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax1(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + 
sum_4d_ax2(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax3(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_02); + sum_3d_ax1(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_03); + sum_3d_ax2(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1(dim0,dim1,dim2,dim3,data,tmp_12); + sum_3d_ax1(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1(dim0,dim1,dim2,dim3,data,tmp_13); + sum_3d_ax2(dim0,dim2,dim3,tmp,tmp_13); + 
+ free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + sum_4d_ax2(dim0,dim1,dim2,dim3,data,tmp_23); + sum_3d_ax2(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0(dim1,dim2,dim3,tmp_0,tmp_01); + sum_2d_ax0(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_01); + sum_3d_ax0(dim1,dim2,dim3,tmp_1,tmp_01); + sum_2d_ax1(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + 
||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + sum_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_02); + sum_3d_ax1(dim1,dim2,dim3,tmp_1,tmp_02); + sum_2d_ax1(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + sum_4d_ax1(dim0,dim1,dim2,dim3,data,tmp_12); + sum_3d_ax1(dim0,dim2,dim3,tmp_1,tmp_12); + sum_2d_ax1(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //reduce mean + else if(param->type==1) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + float s_tmp=0.f; + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + s_tmp+=data[offset]; + } + } + } + } + tmp[0]=s_tmp / (dim0*dim1*dim2*dim3); + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 
&& param_dim3==-2) + { + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax1(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax2(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax3(dim0,dim1,dim2,dim3,data,tmp); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_02); + mean_3d_ax1(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_03); + mean_3d_ax2(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1(dim0,dim1,dim2,dim3,data,tmp_12); + mean_3d_ax1(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( 
(param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1(dim0,dim1,dim2,dim3,data,tmp_13); + mean_3d_ax2(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + mean_4d_ax2(dim0,dim1,dim2,dim3,data,tmp_23); + mean_3d_ax2(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0(dim1,dim2,dim3,tmp_0,tmp_01); + mean_2d_ax0(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float 
*)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_01); + mean_3d_ax0(dim1,dim2,dim3,tmp_1,tmp_01); + mean_2d_ax1(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + mean_4d_ax0(dim0,dim1,dim2,dim3,data,tmp_02); + mean_3d_ax1(dim1,dim2,dim3,tmp_1,tmp_02); + mean_2d_ax1(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + mean_4d_ax1(dim0,dim1,dim2,dim3,data,tmp_12); + mean_3d_ax1(dim0,dim2,dim3,tmp_1,tmp_12); + mean_2d_ax1(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //pase to out_data + for(int i=0;iparam_dim[0]; + int param_dim1=param->param_dim[1]; + int param_dim2=param->param_dim[2]; + int param_dim3=param->param_dim[3]; + //reduce sum + if(param->type==0) + { + if((param_dim0==-2 
&& param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = data[offset] * param->scale[0]; + tmp[0]+=real_input0; + } + } + + } + } + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax2_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax3_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_01,param); + sum_3d_ax0_int8(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_02,param); + sum_3d_ax1_int8(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, 
sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_03,param); + sum_3d_ax2_int8(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp_12,param); + sum_3d_ax1_int8(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp_13,param); + sum_3d_ax2_int8(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + sum_4d_ax2_int8(dim0,dim1,dim2,dim3,data,tmp_23,param); + sum_3d_ax2_int8(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_01,param); + 
sum_3d_ax0_int8(dim1,dim2,dim3,tmp_0,tmp_01); + sum_2d_ax0_int8(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_01,param); + sum_3d_ax0_int8(dim1,dim2,dim3,tmp_1,tmp_01); + sum_2d_ax1_int8(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + sum_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_02,param); + sum_3d_ax1_int8(dim1,dim2,dim3,tmp_1,tmp_02); + sum_2d_ax1_int8(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && 
param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + sum_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp_12,param); + sum_3d_ax1_int8(dim0,dim2,dim3,tmp_1,tmp_12); + sum_2d_ax1_int8(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //reduce mean + else if(param->type==1) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + float s_tmp=0.f; + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = data[offset] * param->scale[0]; + s_tmp+=real_input0; + } + } + } + } + tmp[0]=s_tmp / (dim0*dim1*dim2*dim3); + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax2_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax3_int8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_01,param); + mean_3d_ax0_int8(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else 
if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_02,param); + mean_3d_ax1_int8(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_03,param); + mean_3d_ax2_int8(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp_12,param); + mean_3d_ax1_int8(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp_13,param); + mean_3d_ax2_int8(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + mean_4d_ax2_int8(dim0,dim1,dim2,dim3,data,tmp_23,param); + mean_3d_ax2_int8(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && 
param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_01,param); + mean_3d_ax0_int8(dim1,dim2,dim3,tmp_0,tmp_01); + mean_2d_ax0_int8(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_01,param); + mean_3d_ax0_int8(dim1,dim2,dim3,tmp_1,tmp_01); + mean_2d_ax1_int8(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * 
tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + mean_4d_ax0_int8(dim0,dim1,dim2,dim3,data,tmp_02,param); + mean_3d_ax1_int8(dim1,dim2,dim3,tmp_1,tmp_02); + mean_2d_ax1_int8(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + mean_4d_ax1_int8(dim0,dim1,dim2,dim3,data,tmp_12,param); + mean_3d_ax1_int8(dim0,dim2,dim3,tmp_1,tmp_12); + mean_2d_ax1_int8(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + + float output_max = 0.0f; + for(int i =0; i< out_size; i++) + { + if(output_max < fabs(tmp[i])) + output_max = fabs(tmp[i]); + } + param->scale[1] = output_max/127; + //pase to out_data + for(int i=0;iscale[0]; + s_tmp+=real_input0; + + } + tmp[j]=s_tmp / dim0; + } +} +void mean_4d_ax1_int8(int dim0,int dim1,int dim2,int dim3,int8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nscale[0]; + s_tmp+=real_input0; + } + tmp[n*dim2*dim3+cw]=s_tmp/dim1; + } + } +} +void mean_4d_ax2_int8(int dim0,int dim1,int dim2,int dim3,int8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nscale[0]; + s_tmp+=real_input0; + } + tmp[n*dim1*dim3+h*dim3+c]=s_tmp/dim2; + } + } + } +} +void mean_4d_ax3_int8(int dim0,int dim1,int dim2,int dim3,int8_t * data,float * tmp,reduce_param * param) +{ + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + float s_tmp=0.f; 
+ for(int c = 0; c < dim3; c++) + { + int offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = data[offset] * param->scale[0]; + s_tmp+=real_input0; + } + tmp[n*dim1*dim2+h*dim2+w]=s_tmp/dim3; + } + } + } +} +void mean_3d_ax0_int8(int dim1,int dim2,int dim3,float * tmp,float* tmp_01) +{ + for(int wc=0;wcscale[0]; + tmp[j] +=real_input0; + } + } +} +void sum_4d_ax1_int8(int dim0,int dim1,int dim2,int dim3,int8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nscale[0]; + tmp[n*dim2*dim3+cw]+=real_input0; + } + } + } +} +void sum_4d_ax2_int8(int dim0,int dim1,int dim2,int dim3,int8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nscale[0]; + tmp[n*dim1*dim3+h*dim3+c]+=real_input0; + } + } + } + } +} +void sum_4d_ax3_int8(int dim0,int dim1,int dim2,int dim3,int8_t * data,float * tmp,reduce_param * param) +{ + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + int offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = data[offset] * param->scale[0]; + tmp[n*dim1*dim2+h*dim2+w]+=real_input0; + } + } + } + } +} +void sum_3d_ax0_int8(int dim1,int dim2,int dim3,float * tmp,float* tmp_01) +{ + for(int wc=0;wcparam_dim[0]; + int param_dim1=param->param_dim[1]; + int param_dim2=param->param_dim[2]; + int param_dim3=param->param_dim[3]; + //reduce sum + if(param->type==0) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = (data[offset] - param->zero[0]) * param->scale[0]; + tmp[0]+=real_input0; + } + } + + } + } + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + 
sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax2_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + sum_4d_ax3_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_01,param); + sum_3d_ax0_uint8(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_02,param); + sum_3d_ax1_uint8(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) ||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_03,param); + sum_3d_ax2_uint8(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp_12,param); + 
sum_3d_ax1_uint8(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + sum_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp_13,param); + sum_3d_ax2_uint8(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + sum_4d_ax2_uint8(dim0,dim1,dim2,dim3,data,tmp_23,param); + sum_3d_ax2_uint8(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_01,param); + sum_3d_ax0_uint8(dim1,dim2,dim3,tmp_0,tmp_01); + sum_2d_ax0_uint8(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + 
float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_01,param); + sum_3d_ax0_uint8(dim1,dim2,dim3,tmp_1,tmp_01); + sum_2d_ax1_uint8(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + sum_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_02,param); + sum_3d_ax1_uint8(dim1,dim2,dim3,tmp_1,tmp_02); + sum_2d_ax1_uint8(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + sum_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp_12,param); + sum_3d_ax1_uint8(dim0,dim2,dim3,tmp_1,tmp_12); + sum_2d_ax1_uint8(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //reduce mean + 
else if(param->type==1) + { + if((param_dim0==-2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + || (param_dim0==0 && param_dim1==1 && param_dim2==2 && param_dim3==3)) + { + float s_tmp=0.f; + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + //nhwc + offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = (data[offset] - param->zero[0]) * param->scale[0]; + s_tmp+=real_input0; + } + } + } + } + tmp[0]=s_tmp / (dim0*dim1*dim2*dim3); + } + else if(param_dim0==0 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==1 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==2 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax2_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim0==3 && param_dim1==-2 && param_dim2==-2 && param_dim3==-2) + { + mean_4d_ax3_uint8(dim0,dim1,dim2,dim3,data,tmp,param); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==0) ||(param_dim0==0 && param_dim1==1))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_01,param); + mean_3d_ax0_uint8(dim1,dim2,dim3,tmp,tmp_01); + + free(tmp_01); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==2) ||(param_dim0==2 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_02,param); + mean_3d_ax1_uint8(dim1,dim2,dim3,tmp,tmp_02); + + free(tmp_02); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==0 && param_dim1==3) 
||(param_dim0==3 && param_dim1==0))) + { + //reduce on axis0 + float * tmp_03=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_03, 0, sizeof(float) * dim1*dim2*dim3); + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_03,param); + mean_3d_ax2_uint8(dim1,dim2,dim3,tmp,tmp_03); + free(tmp_03); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==2) ||(param_dim0==2 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp_12,param); + mean_3d_ax1_uint8(dim0,dim2,dim3,tmp,tmp_12); + + free(tmp_12); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==1 && param_dim1==3) ||(param_dim0==3 && param_dim1==1))) + { + //reduce on axis1 + float * tmp_13=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_13, 0, sizeof(float) * dim0*dim2*dim3); + mean_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp_13,param); + mean_3d_ax2_uint8(dim0,dim2,dim3,tmp,tmp_13); + + free(tmp_13); + } + else if(param_dim2==-2 && param_dim3==-2 && ( (param_dim0==2 && param_dim1==3) ||(param_dim0==3 && param_dim1==2))) + { + //reduce on axis2 + float * tmp_23=(float *)malloc(sizeof(float)*dim0*dim1*dim3); + memset(tmp_23, 0, sizeof(float) * dim0*dim1*dim3); + mean_4d_ax2_uint8(dim0,dim1,dim2,dim3,data,tmp_23,param); + mean_3d_ax2_uint8(dim0,dim1,dim3,tmp,tmp_23); + + free(tmp_23); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==2) + ||(param_dim0==0 && param_dim1==2 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==2) + ||(param_dim0==1 && param_dim1==2 && param_dim2==0)||(param_dim0==2 && param_dim1==0 && param_dim2==1) + ||(param_dim0==2 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_0=(float 
*)malloc(sizeof(float)*dim2*dim3); + memset(tmp_0, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_01,param); + mean_3d_ax0_uint8(dim1,dim2,dim3,tmp_0,tmp_01); + mean_2d_ax0_uint8(dim2,dim3,tmp,tmp_0); + + free(tmp_01); + free(tmp_0); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==1 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==1)||(param_dim0==1 && param_dim1==0 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==1) + ||(param_dim0==3 && param_dim1==1 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_01=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_01, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim2*dim3); + memset(tmp_1, 0, sizeof(float) * dim2*dim3); + + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_01,param); + mean_3d_ax0_uint8(dim1,dim2,dim3,tmp_1,tmp_01); + mean_2d_ax1_uint8(dim2,dim3,tmp,tmp_1); + + free(tmp_01); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==0 && param_dim1==2 && param_dim2==3) + ||(param_dim0==0 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && param_dim1==0 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==0)||(param_dim0==3 && param_dim1==0 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==0))) + { + //reduce on axis0 + float * tmp_02=(float *)malloc(sizeof(float)*dim1*dim2*dim3); + memset(tmp_02, 0, sizeof(float) * dim1*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim1*dim3); + memset(tmp_1, 0, sizeof(float) * dim1*dim3); + + mean_4d_ax0_uint8(dim0,dim1,dim2,dim3,data,tmp_02,param); + mean_3d_ax1_uint8(dim1,dim2,dim3,tmp_1,tmp_02); + mean_2d_ax1_uint8(dim1,dim3,tmp,tmp_1); + + free(tmp_02); + free(tmp_1); + } + else if(param_dim3==-2 && ( (param_dim0==1 && param_dim1==2 && param_dim2==3) + ||(param_dim0==1 && param_dim1==3 && param_dim2==2)||(param_dim0==2 && 
param_dim1==1 && param_dim2==3) + ||(param_dim0==2 && param_dim1==3 && param_dim2==1)||(param_dim0==3 && param_dim1==1 && param_dim2==2) + ||(param_dim0==3 && param_dim1==2 && param_dim2==1))) + { + //reduce on axis0 + float * tmp_12=(float *)malloc(sizeof(float)*dim0*dim2*dim3); + memset(tmp_12, 0, sizeof(float) * dim0*dim2*dim3); + + float * tmp_1=(float *)malloc(sizeof(float)*dim0*dim3); + memset(tmp_1, 0, sizeof(float) * dim0*dim3); + + mean_4d_ax1_uint8(dim0,dim1,dim2,dim3,data,tmp_12,param); + mean_3d_ax1_uint8(dim0,dim2,dim3,tmp_1,tmp_12); + mean_2d_ax1_uint8(dim0,dim3,tmp,tmp_1); + + free(tmp_12); + free(tmp_1); + } + } + //pase to out_data + for(int i=0;iscale[1]) + param->zero[1]; + } + free(tmp); + return 0; +} +//mean +void mean_4d_ax0_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int j=0;jzero[0]) * param->scale[0]; + s_tmp+=real_input0; + + } + tmp[j]=s_tmp / dim0; + } +} +void mean_4d_ax1_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nzero[0]) * param->scale[0]; + s_tmp+=real_input0; + } + tmp[n*dim2*dim3+cw]=s_tmp/dim1; + } + } +} +void mean_4d_ax2_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nzero[0]) * param->scale[0]; + s_tmp+=real_input0; + } + tmp[n*dim1*dim3+h*dim3+c]=s_tmp/dim2; + } + } + } +} +void mean_4d_ax3_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + float s_tmp=0.f; + for(int c = 0; c < dim3; c++) + { + int offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = (data[offset] - param->zero[0]) * param->scale[0]; + s_tmp+=real_input0; + } + tmp[n*dim1*dim2+h*dim2+w]=s_tmp/dim3; + } + } + } +} +void mean_3d_ax0_uint8(int dim1,int dim2,int dim3,float * tmp,float* tmp_01) +{ + for(int 
wc=0;wczero[0]) * param->scale[0]; + tmp[j] +=real_input0; + } + } +} +void sum_4d_ax1_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nzero[0]) * param->scale[0]; + tmp[n*dim2*dim3+cw]+=real_input0; + } + } + } +} +void sum_4d_ax2_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int n=0;nzero[0]) * param->scale[0]; + tmp[n*dim1*dim3+h*dim3+c]+=real_input0; + } + } + } + } +} +void sum_4d_ax3_uint8(int dim0,int dim1,int dim2,int dim3,uint8_t * data,float * tmp,reduce_param * param) +{ + for(int n = 0; n < dim0; n++) + { + for(int h = 0; h < dim1; h++) + { + for(int w = 0; w < dim2; w++) + { + for(int c = 0; c < dim3; c++) + { + int offset=n*dim1*dim2*dim3+h*dim2*dim3+w*dim3+c; + float real_input0 = (data[offset] - param->zero[0]) * param->scale[0]; + tmp[n*dim1*dim2+h*dim2+w]+=real_input0; + } + } + } + } +} +void sum_3d_ax0_uint8(int dim1,int dim2,int dim3,float * tmp,float* tmp_01) +{ + for(int wc=0;wcinput_size / sizeof(__fp16)); + int in_num = param->in_num; + +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float *buff = (float*)malloc(input_size); + for(int i = 0; i < in_num; ++i) + { + __fp16 *input_data = input[i]; + for(int j = 0; j < input_size; ++j) + { + float data = fp16_to_fp32(input_data[j]); + buff[j] += data; + } + } + for(int j = 0; j < input_size; ++j) + { + output[j] = fp32_to_fp16(buff[j]); + } + + free(buff); +#else + for(int i = 0; i < in_num; ++i) + { + __fp16 *input_data = input[i]; + for(int j = 0; j < input_size; ++j) + { + output[j] += input_data[j]; + } + } +#endif + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_add_n/ref_addn_fp32.c b/executor/operator/ref/kernel/ref_add_n/ref_addn_fp32.c new file mode 100644 index 000000000..50857e78d --- /dev/null +++ b/executor/operator/ref/kernel/ref_add_n/ref_addn_fp32.c @@ -0,0 +1,16 @@ +static int ref_addn_fp32(float**input,float * output, const ref_addn_param* param) +{ + 
int input_size = (param->input_size / sizeof(float)); + int in_num = param->in_num; + + for(int i = 0; i < in_num; ++i) + { + float *input_data = input[i]; + for(int j = 0; j < input_size; ++j) + { + output[j] += input_data[j]; + } + } + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_add_n/ref_addn_int8.c b/executor/operator/ref/kernel/ref_add_n/ref_addn_int8.c new file mode 100644 index 000000000..8497f45b8 --- /dev/null +++ b/executor/operator/ref/kernel/ref_add_n/ref_addn_int8.c @@ -0,0 +1,35 @@ +static int ref_addn_int8(int8_t**input,int8_t * output, ref_addn_param* param) +{ + int input_size = param->input_size; + int in_num = param->in_num; + + float *out_f32 = (float*)malloc(input_size); + memset(out_f32,0,input_size); + for(int i = 0; i < in_num; ++i) + { + int8_t *input_data = input[i]; + float input_scale = param->in_scale[i]; + for(int j = 0; j < input_size; ++j) + { + out_f32[j] += input_data[j] * input_scale; + } + } + float output_max = 0.0f; + for(int i =0; i< input_size; i++) + { + if(output_max < fabs(out_f32[i])) + output_max = fabs(out_f32[i]); + } + param->out_scale = output_max / 127; + param->out_zero = 0; + for(int i =0; i< input_size; i++) + { + int s32_out = round(out_f32[i]*127/output_max); + if(s32_out > 127) s32_out = 127; + if(s32_out < -127 ) s32_out = -127; + output[i] = s32_out; + } + free(out_f32); + out_f32 = NULL; + return 0; +} diff --git a/executor/operator/ref/kernel/ref_add_n/ref_addn_kernel.h b/executor/operator/ref/kernel/ref_add_n/ref_addn_kernel.h new file mode 100644 index 000000000..64effac3a --- /dev/null +++ b/executor/operator/ref/kernel/ref_add_n/ref_addn_kernel.h @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +#ifndef __REF_ADDN_KERNEL_H__ +#define __REF_ADDN_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ref_addn_param +{ + float * in_scale; + int *in_zero; + int in_num; + int input_size; + float out_scale; + int out_zero; +}; + +typedef int (*ref_add_n_kernel_t)(uint8_t **input,uint8_t * output,const ref_addn_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_addn_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_addn_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_addn_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_addn_uint8.c" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/ref_add_n/ref_addn_uint8.c b/executor/operator/ref/kernel/ref_add_n/ref_addn_uint8.c new file mode 100644 index 000000000..558cb0f7c --- /dev/null +++ b/executor/operator/ref/kernel/ref_add_n/ref_addn_uint8.c @@ -0,0 +1,31 @@ +static int ref_addn_uint8(uint8_t**input,uint8_t * output, const ref_addn_param* param) +{ + int input_size = param->input_size; + int in_num = param->in_num; + float *out_f32 = (float*)malloc(input_size); + memset(out_f32,0,input_size); + for(int i = 0; i < in_num; ++i) + { + uint8_t *input_data = input[i]; + float input_scale = param->in_scale[i]; 
+ int zero_point = param->in_zero[i]; + for(int j = 0; j < input_size; ++j) + { + out_f32[j] += (input_data[j] * input_scale + zero_point); + } + } + for(int j = 0; j < input_size; ++j) + { + int s32_out = round(out_f32[j]/param->out_scale) + param->out_zero; + if(s32_out > 255) + s32_out = 255; + if(s32_out < 0 ) + s32_out = 0; + output[j] = s32_out; + } + free(out_f32); + out_f32 = NULL; + + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_fp16.c b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_fp16.c new file mode 100644 index 000000000..1bb8483e5 --- /dev/null +++ b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_fp16.c @@ -0,0 +1,64 @@ +static int ref_batchnorm_fp16(__fp16*input,__fp16 * output,const ref_batchnorm_param* param) +{ + float* scale_mean = param->scale_mean; + float* scale_var_inv = param->scale_var_inv; + float* gamma = param->gamma; + float* beta = param->beta; + + int img_size = param->input_c * param->input_h * param->input_w; + float *out_f32 = (float*)malloc(sizeof(float)* img_size * param->input_n); + memset(out_f32,0,sizeof(float) * img_size); + for(int n = 0; n < param->input_n; ++n) + { + for(int h = 0; h < param->input_h; ++h) + { + for(int w = 0; w < param->input_w;++w) + { + for(int c = 0; c < param->input_c; ++c) + { + float s_mean = scale_mean[c]; + float s_var = scale_var_inv[c]; + float s_val1 = s_mean; + float s_val2 = s_var; + if(!param->iscaffe) + { + float s_gamma = gamma[c]; + float s_beta = beta[c]; + s_val1 = s_beta + s_gamma * s_mean; + s_val2 = s_gamma * s_var; + } + int offset = 0; + if(TENGINE_LAYOUT_NCHW == param->layout) + { + offset = n * img_size + c * param->input_h * param->input_w + + h * param->input_w + w; + } + else + { + offset = n * img_size + h * param->input_w * param->input_c + + w * param->input_c + c; + } + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float data = fp16_to_fp32(input[offset]); + #else + __fp16 data = input[offset]; + #endif + 
out_f32[offset] = data * s_val2 + s_val1; + } + } + } + } + for(int j = 0; j < img_size * param->input_n; ++j) + { + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + output[j] = fp32_to_fp16(out_f32[j]); + #else + output[j] = (__fp16)out_f32[j]; + #endif + } + free(out_f32); + out_f32 = NULL; + + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_fp32.c b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_fp32.c new file mode 100644 index 000000000..263b8a0fd --- /dev/null +++ b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_fp32.c @@ -0,0 +1,47 @@ +static int ref_batchnorm_fp32(float*input,float * output,const ref_batchnorm_param* param) +{ + float* scale_mean = param->scale_mean; + float* scale_var_inv = param->scale_var_inv; + float* gamma = param->gamma; + float* beta = param->beta; + + int img_size = param->input_c * param->input_h * param->input_w; + + for(int n = 0; n < param->input_n; ++n) + { + for(int h = 0; h < param->input_h; ++h) + { + for(int w = 0; w < param->input_w;++w) + { + for(int c = 0; c < param->input_c; ++c) + { + float s_mean = scale_mean[c]; + float s_var = scale_var_inv[c]; + float s_val1 = s_mean; + float s_val2 = s_var; + if(!param->iscaffe) + { + float s_gamma = gamma[c]; + float s_beta = beta[c]; + s_val1 = s_beta + s_gamma * s_mean; + s_val2 = s_gamma * s_var; + } + int offset = 0; + if(TENGINE_LAYOUT_NCHW == param->layout) + { + offset = n * img_size + c * param->input_h * param->input_w + + h * param->input_w + w; + } + else + { + offset = n * img_size + h * param->input_w * param->input_c + + w * param->input_c + c; + } + output[offset] = input[offset] * s_val2 + s_val1; + } + } + } + } + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_int8.c b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_int8.c new file mode 100644 index 000000000..1e241e8e7 --- /dev/null +++ b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_int8.c @@ -0,0 +1,68 
@@ +static int ref_batchnorm_int8(int8_t*input,int8_t * output,ref_batchnorm_param* param) +{ + float* scale_mean = param->scale_mean; + float* scale_var_inv = param->scale_var_inv; + float* gamma = param->gamma; + float* beta = param->beta; + + int img_size = param->input_c * param->input_h * param->input_w; + float *out_f32 = (float*)malloc(sizeof(float)* img_size * param->input_n); + memset(out_f32,0,sizeof(float) * img_size * param->input_n); + for(int n = 0; n < param->input_n; ++n) + { + for(int h = 0; h < param->input_h; ++h) + { + for(int w = 0; w < param->input_w;++w) + { + for(int c = 0; c < param->input_c; ++c) + { + float s_mean = scale_mean[c]; + float s_var = scale_var_inv[c]; + float s_val1 = s_mean; + float s_val2 = s_var; + if(!param->iscaffe) + { + float s_gamma = gamma[c]; + float s_beta = beta[c]; + s_val1 = s_beta + s_gamma * s_mean; + s_val2 = s_gamma * s_var; + } + int offset = 0; + if(TENGINE_LAYOUT_NCHW == param->layout) + { + offset = n * img_size + c * param->input_h * param->input_w + + h * param->input_w + w; + } + else + { + offset = n * img_size + h * param->input_w * param->input_c + + w * param->input_c + c; + } + float data = (float)param->in_scale * (input[offset] - param->in_zero); + out_f32[offset] = data * s_val2 + s_val1; + } + } + } + } + float output_max = 0.0f; + for(int i =0; i< img_size*param->input_n; i++) + { + if(output_max < fabs(out_f32[i])) + output_max = fabs(out_f32[i]); + } + param->out_scale = output_max / 127; + param->out_zero = 0; + for(int i =0; i< img_size*param->input_n; i++) + { + int s32_out = round(out_f32[i]*127/output_max); + if(s32_out > 127) s32_out = 127; + if(s32_out < -127 ) s32_out = -127; + output[i] = s32_out; + } + + free(out_f32); + out_f32 = NULL; + + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_kernel.h b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_kernel.h new file mode 100644 index 000000000..337dd5512 --- /dev/null +++ 
b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_kernel.h @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +#ifndef __REF_BATCHNORM_KERNEL_H__ +#define __REF_BATCHNORM_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ref_batchnorm_param +{ + int input_n; + int input_h; + int input_w; + int input_c; + int layout; + bool iscaffe; + float* scale_mean; + float* scale_var_inv; + float* gamma; + float* beta; + float in_scale; + int in_zero; + float out_scale; + int out_zero; +}; + +typedef int (*ref_batchnorm_kernel_t)(uint8_t *input,uint8_t * output,const ref_batchnorm_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_batchnorm_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_batchnorm_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_batchnorm_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_batchnorm_uint8.c" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_uint8.c 
b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_uint8.c new file mode 100644 index 000000000..daadabb9d --- /dev/null +++ b/executor/operator/ref/kernel/ref_batchnorm/ref_batchnorm_uint8.c @@ -0,0 +1,62 @@ +static int ref_batchnorm_uint8(uint8_t*input,uint8_t* output,const ref_batchnorm_param* param) +{ + float* scale_mean = param->scale_mean; + float* scale_var_inv = param->scale_var_inv; + float* gamma = param->gamma; + float* beta = param->beta; + + int img_size = param->input_c * param->input_h * param->input_w; + float *out_f32 = (float*)malloc(sizeof(float)* img_size * param->input_n); + memset(out_f32,0,sizeof(float) * img_size * param->input_n); + for(int n = 0; n < param->input_n; ++n) + { + for(int h = 0; h < param->input_h; ++h) + { + for(int w = 0; w < param->input_w;++w) + { + for(int c = 0; c < param->input_c; ++c) + { + float s_mean = scale_mean[c]; + float s_var = scale_var_inv[c]; + float s_val1 = s_mean; + float s_val2 = s_var; + if(!param->iscaffe) + { + float s_gamma = gamma[c]; + float s_beta = beta[c]; + s_val1 = s_beta + s_gamma * s_mean; + s_val2 = s_gamma * s_var; + } + int offset = 0; + if(TENGINE_LAYOUT_NCHW == param->layout) + { + offset = n * img_size + c * param->input_h * param->input_w + + h * param->input_w + w; + } + else + { + offset = n * img_size + h * param->input_w * param->input_c + + w * param->input_c + c; + } + + float data = param->in_scale*(input[offset]-param->in_zero); + out_f32[offset] = data * s_val2 + s_val1; + } + } + } + } + for(int j = 0; j < img_size * param->input_n; ++j) + { + int s32_out = round(out_f32[j]/param->out_scale) + param->out_zero; + if(s32_out > 255) + s32_out = 255; + if(s32_out < 0 ) + s32_out = 0; + output[j] = s32_out; + } + free(out_f32); + out_f32 = NULL; + + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_normalize/ref_normalize_fp16.c b/executor/operator/ref/kernel/ref_normalize/ref_normalize_fp16.c new file mode 100644 index 000000000..b2818cfa3 --- /dev/null +++ 
b/executor/operator/ref/kernel/ref_normalize/ref_normalize_fp16.c @@ -0,0 +1,93 @@ + +static int ref_normalize_fp16(__fp16*input,__fp16 * output,__fp16* scale, const ref_normalize_param* param) +{ + int batch_num = param->input_n; + int in_h = param->input_h; + int in_w = param->input_w; + int in_c = param->input_c; + int in_offset = 0; + int out_offset = 0; + + float *buff = (float*)malloc(sizeof(float)* in_h * in_w); + __fp16 *in_buf = input; + __fp16 *out_buf = output; + + for(int n = 0; n < batch_num; ++n) + { + memset(buff,0,sizeof(float)*in_h*in_w); + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + in_offset = n * in_h * in_w * in_c + c * in_h * in_w + h * in_w + w; + } + else // nhwc + { + in_offset = n * in_h * in_w * in_c + h * in_w * in_c + w * in_c + c; + } + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float data = fp16_to_fp32(in_buf[in_offset]); + #else + __fp16 data = in_buf[in_offset]; + #endif + buff[buff_idx] += data * data; + } + buff[buff_idx] = 1.f / sqrt(buff[buff_idx]); + } + } + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + out_offset = n * in_h * in_w * in_c + c * in_h * in_w + h * in_w + w; + in_offset = out_offset; + } + else // nhwc + { + out_offset = n * in_h * in_w * in_c + h * in_w * in_c + w * in_c + c; + in_offset = out_offset; + } + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float in_data = fp16_to_fp32(in_buf[in_offset]); + #else + __fp16 in_data = in_buf[in_offset]; + #endif + float data = buff[buff_idx]; + + float out_data = in_data * data; + if(scale) + { + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + float scale_data = fp16_to_fp32(scale[c]); + #else + __fp16 scale_data = scale[c]; + #endif + + out_data = out_data * scale_data; + } + + #if!defined( __ARM_ARCH) || __ARM_ARCH 
<8 + out_buf[out_offset] = fp32_to_fp16(out_data); + #else + out_buf[out_offset] = (__fp16)out_data; + #endif + } + } + } + } + + free(buff); + buff = NULL; + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_normalize/ref_normalize_fp32.c b/executor/operator/ref/kernel/ref_normalize/ref_normalize_fp32.c new file mode 100644 index 000000000..99f1f90ec --- /dev/null +++ b/executor/operator/ref/kernel/ref_normalize/ref_normalize_fp32.c @@ -0,0 +1,75 @@ + +static int ref_normalize_fp32(float*input,float * output,float* scale,const ref_normalize_param* param) +{ + + int batch_num = param->input_n; + int in_h = param->input_h; + int in_w = param->input_w; + int in_c = param->input_c; + int in_offset = 0; + int out_offset = 0; + + float *buff = (float*)malloc(sizeof(float)* in_h * in_w); + float *in_buf = input; + float *out_buf = output; + + for(int n = 0; n < batch_num; ++n) + { + in_buf = input + n * in_h * in_w * in_c; + out_buf = output + n * in_h * in_w * in_c; + memset(buff,0,sizeof(float)*in_h*in_w); + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + in_offset = c * in_h * in_w + h * in_w + w; + } + else // nhwc + { + in_offset = h * in_w * in_c + w * in_c + c; + } + float data = in_buf[in_offset]; + buff[buff_idx] += data * data; + } + buff[buff_idx] = 1.f / sqrt(buff[buff_idx]); + } + } + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + out_offset = c * in_h * in_w + h * in_w + w; + in_offset = out_offset; + } + else // nhwc + { + out_offset = h * in_w * in_c + w * in_c + c; + in_offset = out_offset; + } + float data = buff[buff_idx]; + float in_data = in_buf[in_offset]; + out_buf[out_offset] = in_data * data; + if(scale) + { + out_buf[out_offset] = out_buf[out_offset] * scale[c]; + } + } + } + } + } 
+ + free(buff); + buff = NULL; + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_normalize/ref_normalize_int8.c b/executor/operator/ref/kernel/ref_normalize/ref_normalize_int8.c new file mode 100644 index 000000000..389784f77 --- /dev/null +++ b/executor/operator/ref/kernel/ref_normalize/ref_normalize_int8.c @@ -0,0 +1,93 @@ +static int ref_normalize_int8(int8_t*input, int8_t * output,int8_t* scale,ref_normalize_param* param) +{ + + int batch_num = param->input_n; + int in_h = param->input_h; + int in_w = param->input_w; + int in_c = param->input_c; + int in_offset = 0; + int out_offset = 0; + + int out_size = batch_num * in_h * in_w * in_c; + float *buff = (float*)malloc(sizeof(float)* in_h * in_w); + float *out_f32 = (float*)malloc(sizeof(float)* out_size); + int8_t *in_buf = input; + float * out_f32_tmp = out_f32; + for(int n = 0; n < batch_num; ++n) + { + in_buf = input + n * in_h * in_w * in_c; + out_f32_tmp = out_f32 + n * in_h * in_w * in_c; + memset(buff,0,sizeof(float)*in_h*in_w); + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + in_offset = c * in_h * in_w + h * in_w + w; + } + else // nhwc + { + in_offset = h * in_w * in_c + w * in_c + c; + } + float data = (float)param->in_scale * (in_buf[in_offset] - param->in_zero); + buff[buff_idx] += data * data; + } + buff[buff_idx] = 1.f / sqrt(buff[buff_idx]); + } + } + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + out_offset = c * in_h * in_w + h * in_w + w; + in_offset = out_offset; + } + else // nhwc + { + out_offset = h * in_w * in_c + w * in_c + c; + in_offset = out_offset; + } + float data = buff[buff_idx]; + float in_data = (float)param->in_scale * (in_buf[in_offset] + param->in_zero); + float out_data = in_data * data; + if(scale) + { + 
out_data = out_data * param->scale_scale * (scale[c] + param->scale_zero); + } + out_f32_tmp[out_offset] = out_data; + } + } + } + + } + float output_max = 0.0f; + for(int i =0; i< out_size; i++) + { + if(output_max < fabs(out_f32[i])) + output_max = fabs(out_f32[i]); + } + param->out_scale = output_max / 127; + param->out_zero = 0; + for(int i =0; i< out_size; i++) + { + int s32_out = round(out_f32[i]*127/output_max); + if(s32_out > 127) s32_out = 127; + if(s32_out < -127 ) s32_out = -127; + output[i] = s32_out; + } + free(buff); + free(out_f32); + out_f32 = NULL; + buff = NULL; + return 0; +} + diff --git a/executor/operator/ref/kernel/ref_normalize/ref_normalize_kernel.h b/executor/operator/ref/kernel/ref_normalize/ref_normalize_kernel.h new file mode 100644 index 000000000..51d807eba --- /dev/null +++ b/executor/operator/ref/kernel/ref_normalize/ref_normalize_kernel.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +#ifndef __REF_NORMAL_KERNEL_H__ +#define __REF_NORMAL_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ref_normalize_param +{ + int input_n; + int input_h; + int input_w; + int input_c; + int across_spatial; + int channel_shared; + int layout; + float in_scale; + int in_zero; + float out_scale; + int out_zero; + float scale_scale; + int scale_zero; +}; + +typedef int (*ref_normalize_kernel_t)(void *input,void * output,void* scale,const ref_normalize_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_normalize_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_normalize_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_normalize_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_normalize_uint8.c" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/ref_normalize/ref_normalize_uint8.c b/executor/operator/ref/kernel/ref_normalize/ref_normalize_uint8.c new file mode 100644 index 000000000..6e4895781 --- /dev/null +++ b/executor/operator/ref/kernel/ref_normalize/ref_normalize_uint8.c @@ -0,0 +1,80 @@ + +static int ref_normalize_uint8(uint8_t*input, uint8_t * output,uint8_t* scale,const ref_normalize_param* param) +{ + + int batch_num = param->input_n; + int in_h = param->input_h; + int in_w = param->input_w; + int in_c = param->input_c; + int in_offset = 0; + int out_offset = 0; + + float *buff = (float*)malloc(sizeof(float)* in_h * in_w); + uint8_t *in_buf = input; + uint8_t *out_buf = output; + + for(int n = 0; n < batch_num; ++n) + { + in_buf = input + n * in_h * in_w * in_c; + out_buf = output + n * in_h * in_w * in_c; + memset(buff,0,sizeof(float)*in_h*in_w); + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + 
{ + in_offset = c * in_h * in_w + h * in_w + w; + } + else // nhwc + { + in_offset = h * in_w * in_c + w * in_c + c; + } + float data = (float)param->in_scale * (in_buf[in_offset] + param->in_zero); + buff[buff_idx] += data * data; + } + buff[buff_idx] = 1.f / sqrt(buff[buff_idx]); + } + } + for(int h = 0; h < in_h; ++h) + { + for(int w = 0; w < in_w; ++w) + { + int buff_idx = h * in_w + w; + for(int c = 0; c < in_c; ++c) + { + if(param->layout == 0) // nchw + { + out_offset = c * in_h * in_w + h * in_w + w; + in_offset = out_offset; + } + else // nhwc + { + out_offset = h * in_w * in_c + w * in_c + c; + in_offset = out_offset; + } + float data = buff[buff_idx]; + float in_data = (float)param->in_scale * (in_buf[in_offset] - param->in_zero); + float out_data = in_data * data; + if(scale) + { + out_data = out_data * param->scale_scale * (scale[c] - param->scale_zero); + } + int s32_out = round(out_data/param->out_scale) + param->out_zero; + if(s32_out > 255) s32_out = 255; + if(s32_out < 0 ) s32_out = 0; + out_buf[out_offset] = s32_out; + } + } + } + } + + free(buff); + buff = NULL; + + return 0; +} + diff --git a/executor/operator/ref/kernel/relu/relu.h b/executor/operator/ref/kernel/relu/relu.h new file mode 100644 index 000000000..2b377895b --- /dev/null +++ b/executor/operator/ref/kernel/relu/relu.h @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __RELU_KERNEL_H__ +#define __RELU_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" +#include "relu_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int (*relu_t)(void * data, int size, float negative_slope, float scale, int zero_point); + + +#ifdef CONFIG_KERNEL_FP32 +#include "relu_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "relu_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "relu_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "relu_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/relu/relu_common.h b/executor/operator/ref/kernel/relu/relu_common.h new file mode 100644 index 000000000..70994c29b --- /dev/null +++ b/executor/operator/ref/kernel/relu/relu_common.h @@ -0,0 +1,7 @@ +#ifndef __RELU_COMMON_H__ +#define __RELU_COMMON_H__ + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/relu/relu_fp16.c b/executor/operator/ref/kernel/relu/relu_fp16.c new file mode 100644 index 000000000..abc753ab4 --- /dev/null +++ b/executor/operator/ref/kernel/relu/relu_fp16.c @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu_fp16(__fp16 * data, int size, float negative_slope, float scale, int zero_point) +{ + +/* for arm32 && x86 */ +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + //__fp16* data = ( __fp16* )data; + if(negative_slope == 0.0) + { + for(int i = 0; i < size; i++) + { + + data[i] = fp32_to_fp16(MAX(fp16_to_fp32(data[i]), 0.0)); + + + } + } + else + { + for(int i = 0; i < size; i++) + { + float bias= negative_slope * MIN(fp16_to_fp32(data[i]), 0.f); + data[i] = fp32_to_fp16(MAX(fp16_to_fp32(data[i]), 0.f) +bias); + } + } +#else + if(negative_slope == 0) + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], 0.f); + } + } + else + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], 0.f) + negative_slope * MIN(data[i], 0.f); + } + } +#endif + return 0; +} diff --git a/executor/operator/ref/kernel/relu/relu_fp32.c b/executor/operator/ref/kernel/relu/relu_fp32.c new file mode 100644 index 000000000..978ec5d20 --- /dev/null +++ b/executor/operator/ref/kernel/relu/relu_fp32.c @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu_fp32(float * data, int size, float negative_slope, float scale, int zero_point) +{ + //float* out_data = ( float* )data; + if(negative_slope == 0) + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], 0); + } + } + else + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], 0.f) + negative_slope * MIN(data[i], 0.f); + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/relu/relu_int8.c b/executor/operator/ref/kernel/relu/relu_int8.c new file mode 100644 index 000000000..4bd429008 --- /dev/null +++ b/executor/operator/ref/kernel/relu/relu_int8.c @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu_int8(int8_t * data, int size, float negative_slope, float scale, int zero_point) +{ + if(negative_slope == 0) + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], 0); + } + } + else + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], 0.f) + negative_slope * MIN(data[i], 0.f); + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/relu/relu_uint8.c b/executor/operator/ref/kernel/relu/relu_uint8.c new file mode 100644 index 000000000..62aafa940 --- /dev/null +++ b/executor/operator/ref/kernel/relu/relu_uint8.c @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu_uint8(uint8_t * data, int size, float negative_slope, float scale, int zero_point) +{ + if(negative_slope == 0) + { + for(int i = 0; i < size; i++) + { + data[i] = MAX(data[i], zero_point); + } + } + else + { + for(int i = 0; i < size; i++) + { + if(data[i] < zero_point) + { + data[i] = round(negative_slope * data[i]); + } + } + } + return 0; +} diff --git a/executor/operator/ref/kernel/relu6/relu6.h b/executor/operator/ref/kernel/relu6/relu6.h new file mode 100644 index 000000000..5b0eac576 --- /dev/null +++ b/executor/operator/ref/kernel/relu6/relu6.h @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __RELU6_KERNEL_H__ +#define __RELU6_KERNEL_H__ + +#include +#include +#include "compiler_fp16.h" +#include "relu6_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int (*relu6_t)(void * data, int size, float scale, int zero_point); + + +#ifdef CONFIG_KERNEL_FP32 +#include "relu6_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "relu6_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "relu6_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "relu6_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/relu6/relu6_common.h b/executor/operator/ref/kernel/relu6/relu6_common.h new file mode 100644 index 000000000..ceed752a7 --- /dev/null +++ b/executor/operator/ref/kernel/relu6/relu6_common.h @@ -0,0 +1,7 @@ +#ifndef __RELU6_COMMON_H__ +#define __RELU6_COMMON_H__ + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#endif diff --git a/executor/operator/ref/kernel/relu6/relu6_fp16.c b/executor/operator/ref/kernel/relu6/relu6_fp16.c new file mode 100644 index 000000000..cdb5014d8 --- /dev/null +++ b/executor/operator/ref/kernel/relu6/relu6_fp16.c @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu6_fp16(__fp16 * data, int size, float scale, int zero_point) +{ + +/* for arm32 && x86 */ +#if!defined( __ARM_ARCH) || __ARM_ARCH <8 + + for(int i = 0; i < size; i++) + { + data[i] = fp32_to_fp16(MIN(MAX(fp16_to_fp32(data[i]), 0.0),6.0)); + } + +#else + for(int i = 0; i < size; i++) + { + data[i] = MIN(MAX(data[i], 0.0f), 6.0f); + } +#endif + return 0; +} diff --git a/executor/operator/ref/kernel/relu6/relu6_fp32.c b/executor/operator/ref/kernel/relu6/relu6_fp32.c new file mode 100644 index 000000000..694915a00 --- /dev/null +++ b/executor/operator/ref/kernel/relu6/relu6_fp32.c @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + + +int relu6_fp32(float * data, int size, float scale, int zero_point) +{ + for(int i = 0; i < size; i++) + { + data[i] = MIN(MAX(data[i], 0), 6); + } + return 0; +} diff --git a/executor/operator/ref/kernel/relu6/relu6_int8.c b/executor/operator/ref/kernel/relu6/relu6_int8.c new file mode 100644 index 000000000..ffc142642 --- /dev/null +++ b/executor/operator/ref/kernel/relu6/relu6_int8.c @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu6_int8(int8_t * data, int size, float scale, int zero_point) +{ + for(int i = 0; i < size; i++) + { + float real_data = data[i] * scale; + real_data = MIN(MAX(real_data, 0.0), 6.0); + data[i] = round(real_data/scale); + } + return 0; +} diff --git a/executor/operator/ref/kernel/relu6/relu6_uint8.c b/executor/operator/ref/kernel/relu6/relu6_uint8.c new file mode 100644 index 000000000..a2b2f07f6 --- /dev/null +++ b/executor/operator/ref/kernel/relu6/relu6_uint8.c @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int relu6_uint8(uint8_t * data, int size, float scale, int zero_point) +{ + for(int i = 0; i < size; i++) + { + float real_data = (data[i]-zero_point) * scale; + real_data = MIN(MAX(real_data, 0.0), 6.0); + data[i] = round(real_data/scale) + zero_point; + } + return 0; +} diff --git a/executor/operator/ref/kernel/resize/resize_fp16.c b/executor/operator/ref/kernel/resize/resize_fp16.c new file mode 100644 index 000000000..19d428c5e --- /dev/null +++ b/executor/operator/ref/kernel/resize/resize_fp16.c @@ -0,0 +1,92 @@ + +static void bilinear_resize_fp16(__fp16* inp, __fp16* output, int h, int w, int c, float scale_x, float scale_y, int oh, int ow) +{ + int out_hw = oh * ow; + int in_hw = h * w; + for(int j = 0; j < oh; j++) + { + float fy = (j + 0.5) * scale_y - 0.5; + int sy = floor(fy); + fy -= sy; + sy = T_MIN(sy, h - 2); + sy = T_MAX(0, sy); + float fy_0 = 1.f - fy; + + for(int i = 0; i < ow; i++) + { + float fx = (i + 0.5) * scale_x - 0.5; + int sx = floor(fx); + fx -= sx; + if(sx < 0) + { + sx = 0; + fx = 0; + } + if(sx >= w - 1) + { + fx = 0; + sx = w - 2; + } + float fx_0 = 1.f - fx; + int out_idx = j * ow + i; + int in_idx = sy * w + sx; + // printf("i=%d j=%d\t sx=%d fx=%f\t sy=%d 
fy=%f\n",i,j,sx,fx,sy,fy); + for(int k = 0; k < c; k++) + { + int in_index = in_idx + k * in_hw; + #if!defined( __ARM_ARCH) || __ARM_ARCH <8 + output[k * out_hw + out_idx] = fp32_to_fp16(fp16_to_fp32(inp[in_index]) * fx_0 * fy_0 + fp16_to_fp32(inp[in_index + w]) * fx_0 * fy + + fp16_to_fp32(inp[in_index + 1]) * fx * fy_0 + fp16_to_fp32(inp[in_index + w + 1]) * fx * fy); + #else + output[k * out_hw + out_idx] = inp[in_index] * fx_0 * fy_0 + inp[in_index + w] * fx_0 * fy + + inp[in_index + 1] * fx * fy_0 + inp[in_index + w + 1] * fx * fy; + #endif + } + } + } +} + +static int resize_fp16(__fp16* input, __fp16* output, struct resize_param* param) +{ + + int batch = param->batch; + int channel = param->channel; + int in_chw = channel * param->input_h * param->input_w; + int out_chw = channel * param->output_h * param->output_w; + + + for(int n = 0; n < batch; n++) + { + + if(param->type==0) + { + int si, sj; + for(int k = 0; k < channel; k++) + { + __fp16* input_c = input + n*in_chw + k * param->input_h * param->input_w; + __fp16* output_c = output + k *param->output_h * param->output_w; + for(int i = 0; i < param->output_h; i++) + { + si = T_MIN(( int )(i * param->scale_y), param->input_h - 1); + for(int j = 0; j < param->output_w; j++) + { + sj = T_MIN(( int )(j * param->scale_x), param->output_w - 1); + output_c[i * param->output_w + j] = input_c[si * param->input_w + sj]; + } + } + } + input += in_chw; + output += out_chw; + + } + else + { + bilinear_resize_fp16(input + n*in_chw, output + n*out_chw, param->input_h, param->input_w, channel, + param->scale_x, param->scale_y, param->output_h, param->output_w); + input += in_chw; + output += out_chw; + } + } + + return 0; +} diff --git a/executor/operator/ref/kernel/resize/resize_fp32.c b/executor/operator/ref/kernel/resize/resize_fp32.c new file mode 100644 index 000000000..694fe39b2 --- /dev/null +++ b/executor/operator/ref/kernel/resize/resize_fp32.c @@ -0,0 +1,86 @@ + +static void bilinear_resize_fp32(float* inp, 
float* output, int h, int w, int c, float scale_x, float scale_y, int oh, int ow) +{ + int out_hw = oh * ow; + int in_hw = h * w; + for(int j = 0; j < oh; j++) + { + float fy = (j + 0.5) * scale_y - 0.5; + int sy = floor(fy); + fy -= sy; + sy = T_MIN(sy, h - 2); + sy = T_MAX(0, sy); + float fy_0 = 1.f - fy; + + for(int i = 0; i < ow; i++) + { + float fx = (i + 0.5) * scale_x - 0.5; + int sx = floor(fx); + fx -= sx; + if(sx < 0) + { + sx = 0; + fx = 0; + } + if(sx >= w - 1) + { + fx = 0; + sx = w - 2; + } + float fx_0 = 1.f - fx; + int out_idx = j * ow + i; + int in_idx = sy * w + sx; + // printf("i=%d j=%d\t sx=%d fx=%f\t sy=%d fy=%f\n",i,j,sx,fx,sy,fy); + for(int k = 0; k < c; k++) + { + int in_index = in_idx + k * in_hw; + output[k * out_hw + out_idx] = inp[in_index] * fx_0 * fy_0 + inp[in_index + w] * fx_0 * fy + + inp[in_index + 1] * fx * fy_0 + inp[in_index + w + 1] * fx * fy; + } + } + } +} + +static int resize_fp32(float* input, float* output, struct resize_param* param) +{ + int batch = param->batch; + int channel = param->channel; + int in_chw = channel * param->input_h * param->input_w; + int out_chw = channel * param->output_h * param->output_w; + + for(int n = 0; n < batch; n++) + { + + if(param->type==0) + { + int si, sj; + for(int k = 0; k < channel; k++) + { + float* input_c = input + n*in_chw + k * param->input_h * param->input_w; + float* output_c = output + n*out_chw + k * param->output_h * param->output_w; + for(int i = 0; i < param->output_h; i++) + { + si = T_MIN(( int )(i * param->scale_y), param->input_h - 1); + for(int j = 0; j < param->output_w; j++) + { + sj = T_MIN(( int )(j * param->scale_x), param->input_w - 1); + output_c[i * param->output_w + j] = input_c[si * param->input_w + sj]; + } + } + } + + input += in_chw; + output += out_chw; + + } + else + { + bilinear_resize_fp32(input+ n*in_chw, output + n*out_chw, param->input_h, param->input_w, channel, + param->scale_x, param->scale_y, param->output_h, param->output_w); + input += 
in_chw; + output += out_chw; + } + } + + return 0; +} diff --git a/executor/operator/ref/kernel/resize/resize_int8.c b/executor/operator/ref/kernel/resize/resize_int8.c new file mode 100644 index 000000000..9db782b5e --- /dev/null +++ b/executor/operator/ref/kernel/resize/resize_int8.c @@ -0,0 +1,55 @@ +#include + +static int prelu_fp32(int batch_number,int in_chw,int out_chw,float* input, float* output, int h, int w, int c, float scale_x, float scale_y, int oh, int ow) +{ + + for(int i = 0; i < batch_number; i++) + { + bilinear_resize(input, output, h, w, c, scale_x, scale_y, oh, ow); + input += in_chw; + output += out_chw; + } + + return 0; +} +void bilinear_resize(float* inp, float* output, int h, int w, int c, float scale_x, float scale_y, int oh, int ow) +{ + int out_hw = oh * ow; + int in_hw = h * w; + for(int j = 0; j < oh; j++) + { + float fy = (j + 0.5) * scale_y - 0.5; + int sy = floor(fy); + fy -= sy; + sy = min(sy, h - 2); + sy = max(0, sy); + float fy_0 = 1.f - fy; + + for(int i = 0; i < ow; i++) + { + float fx = (i + 0.5) * scale_x - 0.5; + int sx = floor(fx); + fx -= sx; + if(sx < 0) + { + sx = 0; + fx = 0; + } + if(sx >= w - 1) + { + fx = 0; + sx = w - 2; + } + float fx_0 = 1.f - fx; + int out_idx = j * ow + i; + int in_idx = sy * w + sx; + // printf("i=%d j=%d\t sx=%d fx=%f\t sy=%d fy=%f\n",i,j,sx,fx,sy,fy); + for(int k = 0; k < c; k++) + { + int in_index = in_idx + k * in_hw; + output[k * out_hw + out_idx] = inp[in_index] * fx_0 * fy_0 + inp[in_index + w] * fx_0 * fy + + inp[in_index + 1] * fx * fy_0 + inp[in_index + w + 1] * fx * fy; + } + } + } +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/resize/resize_kernel.h b/executor/operator/ref/kernel/resize/resize_kernel.h new file mode 100644 index 000000000..3e75f60a9 --- /dev/null +++ b/executor/operator/ref/kernel/resize/resize_kernel.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __RESIZE_KERNEL_H__ +#define __RESIZE_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct resize_param +{ + int type; + int batch; + int channel; + int input_h; + int input_w; + int output_h; + int output_w; + float scale_x; + float scale_y; +}; +#define T_MAX(a,b) ((a)>(b)?(a):(b)) +#define T_MIN(a,b) ((a)<(b)?(a):(b)) + +typedef int (*resize_t)(void* input, void* output, struct resize_param* param); + +#ifdef CONFIG_KERNEL_FP32 +#include "resize_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "resize_fp16.c" +#endif + +// #ifdef CONFIG_KERNEL_INT8 +// #include "flatten_int8.c" +// #endif + +// #ifdef CONFIG_KERNEL_UINT8 +// #include "flatten_uint8.c" +// #endif + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/resize/resize_uint8.c b/executor/operator/ref/kernel/resize/resize_uint8.c new file mode 100644 index 000000000..9db782b5e --- /dev/null +++ b/executor/operator/ref/kernel/resize/resize_uint8.c @@ -0,0 +1,55 @@ +#include + +static int prelu_fp32(int batch_number,int in_chw,int out_chw,float* input, float* output, int h, int w, int c, 
float scale_x, float scale_y, int oh, int ow) +{ + + for(int i = 0; i < batch_number; i++) + { + bilinear_resize(input, output, h, w, c, scale_x, scale_y, oh, ow); + input += in_chw; + output += out_chw; + } + + return 0; +} +void bilinear_resize(float* inp, float* output, int h, int w, int c, float scale_x, float scale_y, int oh, int ow) +{ + int out_hw = oh * ow; + int in_hw = h * w; + for(int j = 0; j < oh; j++) + { + float fy = (j + 0.5) * scale_y - 0.5; + int sy = floor(fy); + fy -= sy; + sy = min(sy, h - 2); + sy = max(0, sy); + float fy_0 = 1.f - fy; + + for(int i = 0; i < ow; i++) + { + float fx = (i + 0.5) * scale_x - 0.5; + int sx = floor(fx); + fx -= sx; + if(sx < 0) + { + sx = 0; + fx = 0; + } + if(sx >= w - 1) + { + fx = 0; + sx = w - 2; + } + float fx_0 = 1.f - fx; + int out_idx = j * ow + i; + int in_idx = sy * w + sx; + // printf("i=%d j=%d\t sx=%d fx=%f\t sy=%d fy=%f\n",i,j,sx,fx,sy,fy); + for(int k = 0; k < c; k++) + { + int in_index = in_idx + k * in_hw; + output[k * out_hw + out_idx] = inp[in_index] * fx_0 * fy_0 + inp[in_index + w] * fx_0 * fy + + inp[in_index + 1] * fx * fy_0 + inp[in_index + w + 1] * fx * fy; + } + } + } +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/rpn/ref_rpn_fp16.c b/executor/operator/ref/kernel/rpn/ref_rpn_fp16.c new file mode 100644 index 000000000..babe09e36 --- /dev/null +++ b/executor/operator/ref/kernel/rpn/ref_rpn_fp16.c @@ -0,0 +1,52 @@ +#include "ref_rpn_kernel.h" + +int ref_rpn_fp16(const __fp16* score, __fp16* featmap, float* anchors, __fp16* output, struct rpn_param* param) +{ + if(score == nullptr || featmap == nullptr || anchors == nullptr || output == nullptr) + return false; + int featmap_size = param->feat_height * param->feat_width * param->feat_chan; + int max_num_boxes = featmap_size /4; + struct RPN_Box* boxes = (struct RPN_Box*)malloc(max_num_boxes * sizeof(struct RPN_Box)); + + /* __fp16 -> float */ + float* featmap_fp32 = (float*)malloc(featmap_size * sizeof(float)); + 
float* score_fp32 = (float*)malloc(max_num_boxes * sizeof(float)); + for(int i = 0; i < featmap_size; i++) + featmap_fp32[i] = fp16_to_fp32(featmap[i]); + for(int i = 0; i < max_num_boxes; i++) + score_fp32[i] = fp16_to_fp32(score[i]); + + bbox_tranform_inv(featmap_fp32, anchors, param); + + int num_boxes = 0; + ref_filter_boxes(boxes, featmap_fp32, score_fp32, &num_boxes, param); + + sort_rpn_boxes_by_score(boxes, num_boxes); + + if(param->per_nms_topn > 0) + { + num_boxes = RPN_MIN(param->per_nms_topn, num_boxes); + } + nms_rpn_boxes(boxes, &num_boxes, param->nms_thresh); + + if(param->post_nms_topn > 0) + { + num_boxes = RPN_MIN(param->post_nms_topn, num_boxes); + } + // inder shape [default batch=1] + + // std::cout<<"num_box "<feat_height * param->feat_width * param->feat_chan; + int max_num_boxes = featmap_size /4; + struct RPN_Box* boxes = (struct RPN_Box*)malloc(max_num_boxes * sizeof(struct RPN_Box)); + + bbox_tranform_inv(featmap, anchors, param); + + int num_boxes = 0; + ref_filter_boxes(boxes, featmap, score, &num_boxes, param); + + sort_rpn_boxes_by_score(boxes, num_boxes); + + if(param->per_nms_topn > 0) + { + num_boxes = RPN_MIN(param->per_nms_topn, num_boxes); + } + nms_rpn_boxes(boxes, &num_boxes, param->nms_thresh); + + if(param->post_nms_topn > 0) + { + num_boxes = RPN_MIN(param->post_nms_topn, num_boxes); + } + // inder shape [default batch=1] + + // std::cout<<"num_box "<num_classes + 1; + const int num_boxes = param->num_boxes; + + /* transform uint8_t to fp32 */ + int input_size = num_boxes * 4; + int score_size = num_boxes * num_classes; + float* input_f = (float*)malloc( input_size * sizeof(float)); + float* score_f = (float*)malloc( score_size * sizeof(float)); + float* anchor_f = (float*)malloc( input_size * sizeof(float)); + for(int i =0; i < input_size; i++) + input_f[i] = (input[i] - param->zero[0]) * param->quant_scale[0]; + for(int i =0; i < score_size; i++) + score_f[i] = score[i] * param->quant_scale[1]; + for(int i =0; i < 
input_size; i++) + anchor_f[i] = (anchor[i] - param->zero[2]) * param->quant_scale[2]; + + ref_dpp_common(input_f, score_f, anchor_f, param, detect_num, detect_class, detect_score, detect_boxes); + + free(anchor_f); + free(score_f); + free(input_f); + + return 0; +} diff --git a/executor/operator/ref/kernel/rpn/ref_rpn_kernel.h b/executor/operator/ref/kernel/rpn/ref_rpn_kernel.h new file mode 100644 index 000000000..6372c1dbf --- /dev/null +++ b/executor/operator/ref/kernel/rpn/ref_rpn_kernel.h @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#ifndef __REF_RPN_KERNEL_H__ +#define __REF_RPN_KERNEL_H__ + +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct anchor_box +{ + float x0; // xmin + float y0; // ymin + float x1; // xmax + float y1; // ymax +}; +struct RPN_Box +{ + float x0; // xmin + float y0; // ymin + float x1; // xmax + float y1; // ymax + float score; +}; + +struct rpn_param +{ + int feat_height; + int feat_width; + int feat_chan; + int score_chan; + float src_scale; + int src_width; + int src_height; + int num_anchors; + int min_size; + int feat_stride; + int per_nms_topn; + int post_nms_topn; + float nms_thresh; + //float scales[4]; + //float quant_scale[3]; + //int zero[3]; +}; + +#define RPN_MIN(a,b) ( (a)<(b) ? (a) : (b) ) +#define RPN_MAX(a,b) ( (a)>(b) ? (a) : (b) ) + +typedef int (*ref_rpn_kernel_t )(const void* score, void* featmap, float* anchor, void* output, struct rpn_param* param); + +static inline void bbox_tranform_inv(float* m_box, float* local_anchors, struct rpn_param* param) +{ + int feat_size = param->feat_height * param->feat_width; + int c_4 = param->feat_chan / 4; + for(int i = 0; i < c_4; ++i) + { + for(int j = 0; j < (2*feat_size); ++j) + { + local_anchors[(i*4+2)*feat_size + j] -= local_anchors[(i*4+0)*feat_size + j] - 1; + local_anchors[(i*4+0)*feat_size + j] += local_anchors[(i*4+2)*feat_size + j] * 0.5; + + m_box[(i * 4 + 0) * feat_size + j] *= local_anchors[(i*4+2)*feat_size + j]; + m_box[(i * 4 + 0) * feat_size + j] += local_anchors[(i*4+0)*feat_size + j]; + + m_box[(i * 4 + 2) * feat_size + j] = exp(m_box[(i * 4 + 2) * feat_size + j]); + m_box[(i * 4 + 2) * feat_size + j] *= local_anchors[(i*4+2)*feat_size + j]; + } + } +} + +static inline void ref_filter_boxes(struct RPN_Box* boxes, const float* featmap, const float* score, int* num_boxes, struct rpn_param* param) +{ + float local_minsize = param->min_size * param->src_scale; + int 
c_4 = param->feat_chan / 4; + int feat_size = param->feat_height * param->feat_width; + + int offset_w, offset_h, offset_x, offset_y, offset_s; + + int num = 0; + for(int h = 0; h < param->feat_height; h++) + for(int w = 0; w < param->feat_width; w++) + { + offset_x = h * param->feat_width + w; + offset_y = offset_x + feat_size; + offset_w = offset_y + feat_size; + offset_h = offset_w + feat_size; + offset_s = feat_size * param->num_anchors + offset_x; + for(int c = 0; c < c_4; c++) + { + float width = featmap[offset_w]; + float height = featmap[offset_h]; + if((width >= local_minsize) & (height >= local_minsize)) + { + struct RPN_Box tmp; + tmp.x0 = featmap[offset_x] - 0.5 * width; + tmp.y0 = featmap[offset_y] - 0.5 * height; + tmp.x1 = featmap[offset_x] + 0.5 * width; + tmp.y1 = featmap[offset_y] + 0.5 * height; + tmp.x0 = RPN_MIN(RPN_MAX(tmp.x0, 0), param->src_width); + tmp.y0 = RPN_MIN(RPN_MAX(tmp.y0, 0), param->src_height); + tmp.x1 = RPN_MIN(RPN_MAX(tmp.x1, 0), param->src_width); + tmp.y1 = RPN_MIN(RPN_MAX(tmp.y1, 0), param->src_height); + tmp.score = score[offset_s]; + memcpy(boxes + num, &tmp, sizeof(struct RPN_Box)); + num ++; + } + offset_x += 4*feat_size; + offset_y += 4*feat_size; + offset_w += 4*feat_size; + offset_h += 4*feat_size; + offset_s += feat_size; + } + } + + *num_boxes = num; +} + +void sort_rpn_boxes_by_score(struct RPN_Box* boxes, int size) +{ + int i, j; + for(i = 0; i < size-1; i++) + { + int max_idx = i; + for(j = i + 1; j < size; j++) + { + if(boxes[max_idx].score < boxes[j].score) + max_idx = j; + } + if(i != max_idx) + { + struct RPN_Box tmp; + memcpy(&tmp, boxes+i, sizeof(struct RPN_Box)); + memcpy(boxes + i, boxes+max_idx, sizeof(struct RPN_Box)); + memcpy(boxes + max_idx, &tmp, sizeof(struct RPN_Box)); + } + } +} + +void nms_rpn_boxes(struct RPN_Box* input_boxes, int* size, float nms_thresh) +{ + int input_size = *size; + int output_size = 0; + + struct RPN_Box* output_boxes = (struct RPN_Box*)malloc(sizeof(struct 
RPN_Box)*input_size); + float* areas = (float*)malloc(sizeof(float)* input_size); + int* picked = (int*)malloc(sizeof(int)* input_size); + + for(int i = 0; i < input_size; ++i) + { + areas[i] = (input_boxes[i].x1 - input_boxes[i].x0 + 1) * (input_boxes[i].y1 - input_boxes[i].y0 + 1); + } + for(int i = 0; i < input_size; ++i) + { + int keep = 1; + for(int j = 0; j < output_size;j++) + { + float xx1 = RPN_MAX(input_boxes[i].x0, output_boxes[j].x0); + float yy1 = RPN_MAX(input_boxes[i].y0, output_boxes[j].y0); + float xx2 = RPN_MIN(input_boxes[i].x1, output_boxes[j].x1); + float yy2 = RPN_MIN(input_boxes[i].y1, output_boxes[j].y1); + float w = RPN_MAX(float(0), xx2 - xx1 + 1); + float h = RPN_MAX(float(0), yy2 - yy1 + 1); + float inter = w * h; + float ovr = inter / (areas[i] + areas[picked[j]] - inter); + if(ovr >= nms_thresh) + { + keep = 0; + break; + } + } + if(keep) + { + memcpy(output_boxes + output_size, input_boxes + i, sizeof(struct RPN_Box)); + picked[output_size] = i; + output_size ++; + } + + } + memcpy(input_boxes, output_boxes, output_size * sizeof(struct RPN_Box)); + *size = output_size; + free(picked); + free(areas); + free(output_boxes); +} + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_rpn_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_rpn_fp16.c" +#endif +/* + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_rpn_uint8.c" +#endif +*/ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/sigmoid/Makefile b/executor/operator/ref/kernel/sigmoid/Makefile new file mode 100644 index 000000000..78d922637 --- /dev/null +++ b/executor/operator/ref/kernel/sigmoid/Makefile @@ -0,0 +1,4 @@ +# obj-$(CONFIG_KERNEL_FP32)+=sigmoid_fp32.o +# obj-$(CONFIG_KERNEL_FP16)+=sigmoid_fp16.o +# obj-$(CONFIG_KERNEL_INT8)+=sigmoid_int8.o +# obj-$(CONFIG_KERNEL_UINT8)+=sigmoid_uint8.o diff --git a/executor/operator/ref/kernel/sigmoid/sigmoid.h b/executor/operator/ref/kernel/sigmoid/sigmoid.h new file mode 100644 index 000000000..42aaa8606 --- 
/dev/null +++ b/executor/operator/ref/kernel/sigmoid/sigmoid.h @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __SIGMOID_H__ +#define __SIGMOID_H__ + +#include + +#include "compiler_fp16.h" +#include +#ifdef __cplusplus +extern "C" { +#endif + +struct sigmoid_param; + +struct sigmoid_param +{ + float scale[2]; + int zero[2]; +}; + + + +#define T_MAX(a, b) ((a) > (b) ? (a) : (b)) +#define T_MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +typedef int (*sigmoid_t)(void * data, int size,const sigmoid_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "sigmoid_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "sigmoid_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "sigmoid_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "sigmoid_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/sigmoid/sigmoid_fp16.c b/executor/operator/ref/kernel/sigmoid/sigmoid_fp16.c new file mode 100644 index 000000000..45f19d6c5 --- /dev/null +++ b/executor/operator/ref/kernel/sigmoid/sigmoid_fp16.c @@ -0,0 +1,22 @@ + + +int sigmoid_fp16(__fp16 * data,int size,const sigmoid_param * param) +{ + for(int i=0;iscale[0]; + float real_comp = T_MIN(real_in, 30); + real_comp = T_MAX(real_in, -30); + + real_comp = 1 / (1 + exp(-real_comp)); + data[i] = round(real_comp*127); + + } + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/sigmoid/sigmoid_uint8.c b/executor/operator/ref/kernel/sigmoid/sigmoid_uint8.c new file mode 100644 index 000000000..06ec6bbc1 --- /dev/null +++ b/executor/operator/ref/kernel/sigmoid/sigmoid_uint8.c @@ -0,0 +1,15 @@ + + +int sigmoid_uint8(uint8_t * data,int size,const sigmoid_param * param) +{ + for(int i=0;izero[0])*param->scale[0]; + float real_comp = T_MIN(real_in, 30); + real_comp = T_MAX(real_in, -30); + + real_comp = 1 / (1 + exp(-real_comp)); + data[i] = round(real_comp/param->scale[1]) + param->zero[1]; + } + return 0; +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/slice/slice_common.c b/executor/operator/ref/kernel/slice/slice_common.c new file mode 100644 index 000000000..7621e2a27 --- /dev/null +++ b/executor/operator/ref/kernel/slice/slice_common.c @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +static int caffe_run(const int8_t* in_data,int8_t** out_data,int element_size,const struct slice_param *param) +{ + // get the slice param + int slice_axis = param->axis; + int num_slices = 1; + int slice_size = 1; + const int8_t * input = in_data; + const int *in_dim = param->in_shape; + + for(int i = 0; i < slice_axis; i++) + { + num_slices = num_slices * in_dim[i]; + } + for(int i = slice_axis + 1; i < param->dim_num; i++) + { + slice_size = slice_size * in_dim[i]; + } + int in_slice = in_dim[slice_axis]; + int slice_index = 0; + int out_num = param->out_num; + for(int i = 0; i < out_num; i++) + { + int8_t* output = out_data[i]; + int out_slice = param->output_shape[i].dims[slice_axis]; + for(int n = 0; n < num_slices; n++) + { + int in_offset = (n * in_slice + slice_index) * slice_size * element_size; + int out_offset = n * out_slice * slice_size * element_size; + memcpy(output+out_offset,input+in_offset,slice_size * out_slice * element_size); + } + slice_index += out_slice; + } + return 0; + +} +static int tf_run(const int8_t* in_data,int8_t** out_data,int element_size,const struct slice_param *param) +{ + const int8_t* input = in_data; + int8_t* output = out_data[0]; + + const int 
*begins = param->output_shape[0].begins; + const int *sizes = param->output_shape[0].sizes; + int real_dim = param->dim_num; + const int* in_dim_new = param->in_shape; + int in_dim_0 = in_dim_new[0]; + int in_dim_1 = in_dim_new[1]; + int in_dim_2 = in_dim_new[2]; + int in_dim_3 = in_dim_new[3]; + + int start_dim_0 = (4 - real_dim) > 0 ? 0 : begins[0]; + int stop_dim_0 = ((4 - real_dim) > 0 || sizes[0] == -1) + ? in_dim_0 - start_dim_0 + : start_dim_0 + sizes[0]; + int start_dim_1 = (3 - real_dim) > 0 ? 0 : begins[1]; + int stop_dim_1 = ((3 - real_dim) > 0 || sizes[1] == -1) + ? in_dim_1 - start_dim_1 + : start_dim_1 + sizes[1]; + int start_dim_2 = (2 - real_dim) > 0 ? 0 : begins[2]; + int stop_dim_2 = ((2 - real_dim) > 0 || sizes[2] == -1) + ? in_dim_2 - start_dim_2 + : start_dim_2 + sizes[2]; + int start_dim_3 = (1 - real_dim) > 0 ? 0 : begins[3]; + int stop_dim_3 = ((1 - real_dim) > 0 || sizes[3] == -1) + ? in_dim_3 - start_dim_3 + : start_dim_3 + sizes[3]; + + for(int n = start_dim_0; n < stop_dim_0;++n) + { + for(int i = start_dim_1; i < stop_dim_1; ++i) + { + for(int j = start_dim_2; j < stop_dim_2; ++j) + { + int len = stop_dim_3 - start_dim_3; + int input_off = n * in_dim_1 * in_dim_2 * in_dim_3 + + i * in_dim_2 * in_dim_3 + + j * in_dim_3 + start_dim_3; + memcpy(output,input + input_off,len * element_size); + output += len * element_size; + } + } + } + return 0; +} +static int ref_slice_common(const int8_t* in_data,int8_t** out_data,int element_size,const struct slice_param *param) +{ + if(param->iscaffe) + return caffe_run(in_data,out_data,element_size,param); + else + return tf_run(in_data,out_data,element_size,param); + +} diff --git a/executor/operator/ref/kernel/slice/slice_fp16.c b/executor/operator/ref/kernel/slice/slice_fp16.c new file mode 100644 index 000000000..7aff3ed2c --- /dev/null +++ b/executor/operator/ref/kernel/slice/slice_fp16.c @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +static int ref_slice_fp16(const __fp16* in_data, __fp16** out_data, const struct slice_param* param) +{ + return ref_slice_common((const int8_t*)in_data,(int8_t**)out_data,sizeof(__fp16),param); +} \ No newline at end of file diff --git a/executor/operator/ref/kernel/slice/slice_fp32.c b/executor/operator/ref/kernel/slice/slice_fp32.c new file mode 100644 index 000000000..d35989590 --- /dev/null +++ b/executor/operator/ref/kernel/slice/slice_fp32.c @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + + +static int ref_slice_fp32(const float* in_data, float** out_data, const struct slice_param* param) +{ + return ref_slice_common((const int8_t*)in_data,(int8_t**)out_data,sizeof(float),param); +} diff --git a/executor/operator/ref/kernel/slice/slice_int8.c b/executor/operator/ref/kernel/slice/slice_int8.c new file mode 100644 index 000000000..50a09cd60 --- /dev/null +++ b/executor/operator/ref/kernel/slice/slice_int8.c @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +static int ref_slice_int8(const int8_t* in_data, int8_t** out_data, const struct slice_param* param) +{ + return ref_slice_common((const int8_t*)in_data,(int8_t**)out_data,sizeof(int8_t),param); +} diff --git a/executor/operator/ref/kernel/slice/slice_kernel.h b/executor/operator/ref/kernel/slice/slice_kernel.h new file mode 100644 index 000000000..f219d3282 --- /dev/null +++ b/executor/operator/ref/kernel/slice/slice_kernel.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +#ifndef __SLICE_KERNEL_H__ +#define __SLICE_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct shape_dim +{ + int dims[4]; // for caffe + int begins[4]; // for tf + int sizes[4]; // for tf +}; + +struct slice_param +{ + int in_shape[4]; // the dim of the input + struct shape_dim *output_shape; // out shape + int out_num; + int dim_num; + int axis; // for caffe + float out_scale; // for input tensor int8 + bool iscaffe; +}; + +typedef int (*slice_t)(const int8_t* in_data, int8_t** out_data, const struct slice_param* param); + +#include "slice_common.c" +#ifdef CONFIG_KERNEL_FP32 +#include "slice_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "slice_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "slice_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "slice_uint8.c" +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/slice/slice_uint8.c b/executor/operator/ref/kernel/slice/slice_uint8.c new file mode 100644 index 000000000..c6fffce78 --- /dev/null +++ b/executor/operator/ref/kernel/slice/slice_uint8.c @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +static int ref_slice_uint8(const uint8_t* in_data, uint8_t** out_data, const struct slice_param* param) +{ + return ref_slice_common((const int8_t*)in_data,(int8_t**)out_data,sizeof(uint8_t),param); +} + diff --git a/executor/operator/ref/kernel/softmax/ref_softmax.h b/executor/operator/ref/kernel/softmax/ref_softmax.h new file mode 100644 index 000000000..a46b69f9b --- /dev/null +++ b/executor/operator/ref/kernel/softmax/ref_softmax.h @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#ifndef __REF_SOFTMAX_OP_KERNEL_H__ +#define __REF_SOFTMAX_OP_KERNEL_H__ + +#include +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct op_data +{ + int out_size; + int in_size; + int on_size; + int i_zero; + float i_scale; + int o_zero; + float o_scale; +}; + + +static void GetMaxArray(float* input, float* array, int in_size, int on_size) +{ + float* input_ptr = ( float* )input; + float* array_ptr = ( float* )array; + memset(array, 0, in_size * sizeof(float)); + + for(int j = 0; j < on_size; j++) + for(int l = 0; l < in_size; l++) + { + if(array_ptr[l] < input_ptr[j * in_size + l]) + array_ptr[l] = input_ptr[j * in_size + l]; + } +} + + +static void GetOutResult(float* input, float* output, float* array, float* sum_array, int in_size, int on_size) +{ + float* input_ptr = ( float* )input; + float* output_ptr = ( float* )output; + float* array_ptr = ( float* )array; + float* sum_array_ptr = ( float* )sum_array; + + memset(sum_array, 0x0, in_size * sizeof(float)); + + /* get the exp and the summary */ + + for(int j = 0; j < on_size; j++) + for(int l = 0; l < in_size; l++) + { + int index = j * in_size + l; + output_ptr[index] = exp(input_ptr[index] - array_ptr[l]); + sum_array_ptr[l] += output_ptr[index]; + } + + /* the final result */ + for(int j = 0; j < on_size; j++) + for(int l = 0; l < in_size; l++) + { + int index = j * in_size + l; + output_ptr[index] /= sum_array_ptr[l]; + } +} + + + +typedef int (*ref_softmax_kernel_t)(void * input, void * output, void * max_array, void * sum_array, op_data* op_param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_softmax_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "ref_softmax_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_softmax_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_softmax_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif 
diff --git a/executor/operator/ref/kernel/softmax/ref_softmax_fp16.c b/executor/operator/ref/kernel/softmax/ref_softmax_fp16.c new file mode 100644 index 000000000..24d493c99 --- /dev/null +++ b/executor/operator/ref/kernel/softmax/ref_softmax_fp16.c @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#include + +#include "ref_softmax.h" +#include +#include + + + int ref_softmax_kernel_fp16(__fp16* input, __fp16* output, float* max_array, float* sum_array, op_data* op_param) + { + int out_size = op_param->out_size; + int in_size = op_param->in_size; + int on_size = op_param->on_size; + int on_in_size = in_size * on_size; + + float* input_f = (float*)malloc(out_size * on_in_size * sizeof(float)); + float* output_f = (float*)malloc(out_size * on_in_size * sizeof(float)); + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + input_f[i*on_in_size+j] = fp16_to_fp32(input[i*on_in_size+j]); + + for(int i = 0; i < out_size; i++) + { + /* get max */ + int img_base = i * in_size * on_size; + GetMaxArray(input_f + img_base, max_array, in_size, on_size); + GetOutResult(input_f + img_base, output_f + img_base, max_array, sum_array, in_size, on_size); + } + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + output[i*on_in_size+j] = fp32_to_fp16(output_f[i*on_in_size+j]); + + free(input_f); + free(output_f); + + return 0; + } + + + + diff --git a/executor/operator/ref/kernel/softmax/ref_softmax_fp32.c b/executor/operator/ref/kernel/softmax/ref_softmax_fp32.c new file mode 100644 index 000000000..c7424d1bc --- /dev/null +++ b/executor/operator/ref/kernel/softmax/ref_softmax_fp32.c @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#include +#include + +#include "ref_softmax.h" +#include + + + int ref_softmax_kernel_fp32(float* input, float* output, float* max_array, float* sum_array, op_data* op_param) + { + for(int i = 0; i < op_param->out_size; i++) + { + /* get max */ + int img_base = i * op_param->in_size * op_param->on_size; + GetMaxArray(input + img_base, max_array, op_param->in_size, op_param->on_size); + GetOutResult(input + img_base, output + img_base, max_array, sum_array, op_param->in_size, op_param->on_size); + } + + return 0; + } + + + + diff --git a/executor/operator/ref/kernel/softmax/ref_softmax_int8.c b/executor/operator/ref/kernel/softmax/ref_softmax_int8.c new file mode 100644 index 000000000..3ca40f0f6 --- /dev/null +++ b/executor/operator/ref/kernel/softmax/ref_softmax_int8.c @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#include + +#include "ref_softmax.h" +#include +#include + + + int ref_softmax_kernel_int8(int8_t* input, int8_t* output, float* max_array, float* sum_array, op_data* op_param) + { + int out_size = op_param->out_size; + int in_size = op_param->in_size; + int on_size = op_param->on_size; + int on_in_size = in_size * on_size; + + float* input_f = (float*)malloc(out_size * on_in_size * sizeof(float)); + float* output_f = (float*)malloc(out_size * on_in_size * sizeof(float)); + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + input_f[i*on_in_size+j] = (input[i*on_in_size+j])*op_param->i_scale; + + for(int i = 0; i < out_size; i++) + { + /* get max */ + int img_base = i * in_size * on_size; + GetMaxArray(input_f + img_base, max_array, in_size, on_size); + GetOutResult(input_f + img_base, output_f + img_base, max_array, sum_array, in_size, on_size); + } + + float fmax=0.0f; + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + if(fmaxo_scale = o_scale; + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + output[i*on_in_size+j] = round(output_f[i*on_in_size+j]/op_param->o_scale); + + free(input_f); + free(output_f); + + return 0; + } + + diff --git a/executor/operator/ref/kernel/softmax/ref_softmax_uint8.c b/executor/operator/ref/kernel/softmax/ref_softmax_uint8.c new file mode 100644 index 000000000..8fdc5c080 --- /dev/null +++ b/executor/operator/ref/kernel/softmax/ref_softmax_uint8.c @@ -0,0 +1,63 @@ 
+/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#include + +#include "ref_softmax.h" +#include +#include + + + int ref_softmax_kernel_uint8(uint8_t* input, uint8_t* output, float* max_array, float* sum_array, op_data* op_param) + { + int out_size = op_param->out_size; + int in_size = op_param->in_size; + int on_size = op_param->on_size; + int on_in_size = in_size * on_size; + + float* input_f = (float*)malloc(out_size * on_in_size * sizeof(float)); + float* output_f = (float*)malloc(out_size * on_in_size * sizeof(float)); + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + input_f[i*on_in_size+j] = (input[i*on_in_size+j]-op_param->i_zero)*op_param->i_scale; + + for(int i = 0; i < out_size; i++) + { + /* get max */ + int img_base = i * in_size * on_size; + GetMaxArray(input_f + img_base, max_array, in_size, on_size); + GetOutResult(input_f + img_base, output_f + img_base, max_array, sum_array, in_size, on_size); + } + + for(int i = 0; i < out_size; i++) + for(int j=0; j< on_in_size; j++) + output[i*on_in_size+j] = round((output_f[i*on_in_size+j]/op_param->o_scale)+op_param->o_zero); + + free(input_f); + 
free(output_f); + + return 0; + } + diff --git a/executor/operator/ref/kernel/split/split_fp16.c b/executor/operator/ref/kernel/split/split_fp16.c new file mode 100644 index 000000000..e7a38bb23 --- /dev/null +++ b/executor/operator/ref/kernel/split/split_fp16.c @@ -0,0 +1,32 @@ +static int split_fp16(const __fp16* in_data, __fp16** out_data, struct split_param* param) +{ + int slice_axis = param->axis; + int num_slices = 1; + int slice_size = 1; + for(int i = 0; i < slice_axis; i++) + { + num_slices = num_slices * param->input_shape.dim[i]; + } + for(int i = slice_axis + 1; i < param->input_dim; i++) + { + slice_size = slice_size * param->input_shape.dim[i]; + } + int in_slice = param->input_shape.dim[slice_axis]; + int slice_index = 0; + unsigned int out_num = param->output_counts; + for(unsigned int i = 0; i < out_num; i++) + { + __fp16 * output=(__fp16*)out_data[i]; + int out_slice = param->output_shape[i].dim[slice_axis]; + + for(int n = 0; n < num_slices; n++) + { + int in_offset = (n * in_slice + slice_index) * slice_size; + int out_offset = n * out_slice * slice_size; + memcpy(output+out_offset,in_data + in_offset,slice_size * out_slice * sizeof(__fp16)); + } + slice_index += out_slice; + } + + return 0; +} diff --git a/executor/operator/ref/kernel/split/split_fp32.c b/executor/operator/ref/kernel/split/split_fp32.c new file mode 100644 index 000000000..23496a5b2 --- /dev/null +++ b/executor/operator/ref/kernel/split/split_fp32.c @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + + +static int split_fp32(const float* in_data, float** out_data, struct split_param* param) +{ + int slice_axis = param->axis; + int num_slices = 1; + int slice_size = 1; + for(int i = 0; i < slice_axis; i++) + { + num_slices = num_slices * param->input_shape.dim[i]; + } + for(int i = slice_axis + 1; i < param->input_dim; i++) + { + slice_size = slice_size * param->input_shape.dim[i]; + } + int in_slice = param->input_shape.dim[slice_axis]; + int slice_index = 0; + unsigned int out_num = param->output_counts; + for(unsigned int i = 0; i < out_num; i++) + { + float * output=(float*)out_data[i]; + int out_slice = param->output_shape[i].dim[slice_axis]; + + for(int n = 0; n < num_slices; n++) + { + int in_offset = (n * in_slice + slice_index) * slice_size; + int out_offset = n * out_slice * slice_size; + memcpy(output+out_offset,in_data + in_offset,slice_size * out_slice * sizeof(float)); + } + slice_index += out_slice; + } + + return 0; +} diff --git a/executor/operator/ref/kernel/split/split_int8.c b/executor/operator/ref/kernel/split/split_int8.c new file mode 100644 index 000000000..8a915d641 --- /dev/null +++ b/executor/operator/ref/kernel/split/split_int8.c @@ -0,0 +1,32 @@ +static int split_int8(const int8_t* in_data, int8_t** out_data, struct split_param* param) +{ + int slice_axis = param->axis; + int num_slices = 1; + int slice_size = 1; + for(int i = 0; i < slice_axis; i++) + { + num_slices = num_slices * param->input_shape.dim[i]; + } + for(int i = 
slice_axis + 1; i < param->input_dim; i++) + { + slice_size = slice_size * param->input_shape.dim[i]; + } + int in_slice = param->input_shape.dim[slice_axis]; + int slice_index = 0; + unsigned int out_num = param->output_counts; + for(unsigned int i = 0; i < out_num; i++) + { + int8_t * output=(int8_t*)out_data[i]; + int out_slice = param->output_shape[i].dim[slice_axis]; + + for(int n = 0; n < num_slices; n++) + { + int in_offset = (n * in_slice + slice_index) * slice_size; + int out_offset = n * out_slice * slice_size; + memcpy(output+out_offset,in_data + in_offset,slice_size * out_slice * sizeof(int8_t)); + } + slice_index += out_slice; + } + + return 0; +} diff --git a/executor/operator/ref/kernel/split/split_kernel.h b/executor/operator/ref/kernel/split/split_kernel.h new file mode 100644 index 000000000..13bbddbed --- /dev/null +++ b/executor/operator/ref/kernel/split/split_kernel.h @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __SPLIT_KERNEL_H__ +#define __SPLIT_KERNEL_H__ + +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct shape_dim +{ + int dim[4]; + float scale; + int zero; +}; + +struct split_param +{ + struct shape_dim input_shape; + int output_counts; + int input_dim; + struct shape_dim* output_shape; + int output_dim; + int axis; + float out_scale; +}; + + +typedef int (*split_t)(void * data,void ** out_data,split_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "split_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "split_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "split_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "split_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/executor/operator/ref/kernel/split/split_uint8.c b/executor/operator/ref/kernel/split/split_uint8.c new file mode 100644 index 000000000..b07bf24e6 --- /dev/null +++ b/executor/operator/ref/kernel/split/split_uint8.c @@ -0,0 +1,32 @@ +static int split_uint8(const uint8_t* in_data, uint8_t** out_data,struct split_param* param) +{ + int slice_axis = param->axis; + int num_slices = 1; + int slice_size = 1; + for(int i = 0; i < slice_axis; i++) + { + num_slices = num_slices * param->input_shape.dim[i]; + } + for(int i = slice_axis + 1; i < param->input_dim; i++) + { + slice_size = slice_size * param->input_shape.dim[i]; + } + int in_slice = param->input_shape.dim[slice_axis]; + int slice_index = 0; + unsigned int out_num = param->output_counts; + for(unsigned int i = 0; i < out_num; i++) + { + uint8_t * output=(uint8_t*)out_data[i]; + int out_slice = param->output_shape[i].dim[slice_axis]; + + for(int n = 0; n < num_slices; n++) + { + int in_offset = (n * in_slice + slice_index) * slice_size; + int out_offset = n * out_slice * slice_size; + memcpy(output+out_offset,in_data + 
in_offset,slice_size * out_slice * sizeof(uint8_t)); + } + slice_index += out_slice; + } + + return 0; +} diff --git a/executor/operator/ref/kernel/swap_axis/ref_swap_axis_fp32.c b/executor/operator/ref/kernel/swap_axis/ref_swap_axis_fp32.c new file mode 100644 index 000000000..980892587 --- /dev/null +++ b/executor/operator/ref/kernel/swap_axis/ref_swap_axis_fp32.c @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +static int ref_swap_axis_fp32(const float* in_data,float* out_data,const int* dims) +{ + + for(int i = 0; i < dims[0]; i ++) + for(int j = 0; j < dims[3]; j ++) + for(int p = 0; p < dims[2]; p ++) + for(int q = 0; q < dims[1]; q ++) + { + int out_index = i*dims[1]*dims[2]*dims[3]*dims[4] + j*dims[2]*dims[1]*dims[4] + + p*dims[1]*dims[4] + q*dims[4]; + int in_index = i*dims[1]*dims[2]*dims[3]*dims[4] + q*dims[2]*dims[3]*dims[4] + + p*dims[3]*dims[4] + j*dims[4]; + memcpy(out_data + out_index, in_data + in_index, dims[4]*sizeof(float)); + } + + return 0; +} diff --git a/executor/operator/ref/kernel/swap_axis/ref_swap_axis_kernel.h b/executor/operator/ref/kernel/swap_axis/ref_swap_axis_kernel.h new file mode 100644 index 000000000..1b124bbc8 --- /dev/null +++ b/executor/operator/ref/kernel/swap_axis/ref_swap_axis_kernel.h @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + + +#ifndef __REF_SWAP_AXIS_H__ +#define __REF_SWAP_AXIS_H__ + +#include +#include +#include +#include "compiler_fp16.h" + +#ifdef __cplusplus + extern "C" { +#endif + + +typedef int (*ref_swap_axis_kernel_t)(void * input, void * output, int* dims); + + +#ifdef CONFIG_KERNEL_FP32 +#include "ref_swap_axis_fp32.c" +#endif +/* +#ifdef CONFIG_KERNEL_FP16 +#include "ref_swap_axis_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "ref_swap_axis_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "ref_swap_axis_uint8.c" +#endif +*/ + + + +#ifdef __cplusplus +} +#endif + + +#endif + + diff --git a/executor/operator/ref/kernel/tanh/tanh.h b/executor/operator/ref/kernel/tanh/tanh.h new file mode 100644 index 000000000..8d13a87e7 --- /dev/null +++ b/executor/operator/ref/kernel/tanh/tanh.h @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#ifndef __TANH_KERNEL_H__ +#define __TANH_KERNEL_H__ + +#include +#include + +#include "compiler_fp16.h" +#include "tanh_common.h" +#ifdef __cplusplus +extern "C" { +#endif +struct tanh_param{ + float input_scale; + int input_zero; + float output_scale; + int output_zero; +}; +typedef int (*tanh_t)(void * data, int size, struct tanh_param* param); + + +#ifdef CONFIG_KERNEL_FP32 +#include "tanh_fp32.c" +#endif + +#ifdef CONFIG_KERNEL_FP16 +#include "tanh_fp16.c" +#endif + +#ifdef CONFIG_KERNEL_INT8 +#include "tanh_int8.c" +#endif + +#ifdef CONFIG_KERNEL_UINT8 +#include "tanh_uint8.c" +#endif + + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/executor/operator/ref/kernel/tanh/tanh_common.h b/executor/operator/ref/kernel/tanh/tanh_common.h new file mode 100644 index 000000000..25026b4bb --- /dev/null +++ b/executor/operator/ref/kernel/tanh/tanh_common.h @@ -0,0 +1,7 @@ +#ifndef __TANH_COMMON_H__ +#define __TANH_COMMON_H__ + +#define T_MAX(a, b) ((a) > (b) ? (a) : (b)) +#define T_MIN(a, b) ((a) < (b) ? (a) : (b)) + +#endif diff --git a/executor/operator/ref/kernel/tanh/tanh_fp16.c b/executor/operator/ref/kernel/tanh/tanh_fp16.c new file mode 100644 index 000000000..f5c44e8b9 --- /dev/null +++ b/executor/operator/ref/kernel/tanh/tanh_fp16.c @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + + +int tanh_fp16(__fp16 * data,int size, struct tanh_param* param) +{ + for(int i=0;iinput_scale; + real_data = T_MIN(real_data, 30.0f); + real_data = T_MAX(real_data, -30.0f); + + real_data = (exp(real_data) - exp(-real_data)) / (exp(real_data) + exp(-real_data)); + data[i] = real_data * 127; + } + return 0; +} diff --git a/executor/operator/ref/kernel/tanh/tanh_uint8.c b/executor/operator/ref/kernel/tanh/tanh_uint8.c new file mode 100644 index 000000000..d0870744d --- /dev/null +++ b/executor/operator/ref/kernel/tanh/tanh_uint8.c @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +int tanh_uint8(int8_t * data, int size, struct tanh_param* param) +{ + for(int i=0;iinput_zero) * param->input_scale; + real_data = T_MIN(real_data, 30.0f); + real_data = T_MAX(real_data, -30.0f); + + real_data = (exp(real_data) - exp(-real_data)) / (exp(real_data) + exp(-real_data)); + data[i] = round(real_data /param->output_scale) + param->output_zero; + } + return 0; +} diff --git a/executor/operator/ref/pad.cpp b/executor/operator/ref/pad.cpp new file mode 100644 index 000000000..3e9447735 --- /dev/null +++ b/executor/operator/ref/pad.cpp @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +#include "operator/pad.hpp" +#include "kernel/pad/pad_kernel.h" + +namespace TEngine { + +namespace RefPadOps { + + + +struct RefPad : public MTNodeOps +{ + bool Prerun(Node * node) override; + + bool Run(Node * node) override; + void InitRegistry(void); + + pad_t kernel_run; + pad_param param; + + KernelRegistry kernel_registry; + RefPad(void) + { + InitRegistry(); + } +}; + + +bool RefPad::Prerun(Node * node) +{ + Tensor * input_tensor=node->GetInputTensor(0); + + int layout=exec_attr->graph_layout; + + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + + return true; +} +static int get_scale_zero(Tensor* itensor,Tensor * otensor,pad_param* param) +{ + auto* i_quant = itensor->GetQuantParam(); + auto* o_quant = otensor->GetQuantParam(); + if( i_quant->size() != 1 ) + return -1; + param->scale[0] = (*i_quant)[0].scale; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + if( o_quant->size() != 1) + return -1; + + param->scale[1] = (*o_quant)[0].scale; + param->zero[1] = (*o_quant)[0].zero_point; + + param->zero[0] = (*i_quant)[0].zero_point; + } + return 0; +} + +bool RefPad::Run(Node * node) +{ + Pad* pad_op = dynamic_cast(node->GetOp()); + PadParam* op_param = pad_op->GetParam(); + + Tensor * input_tensor = node->GetInputTensor(0); + Tensor * out_tensor = node->GetOutputTensor(0); + // int element_size = DataType::GetTypeSize(out_tensor->GetDataType()); + // int out_size= out_tensor->GetTotalSize() / element_size; + + if(input_tensor->GetDataType() == TENGINE_DT_INT8 ||input_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero(input_tensor,out_tensor, ¶m) < 0) + return false; + } + 
const TShape& i_shape = input_tensor->GetShape(); + std::vector i_dims=i_shape.GetDim(); + + const TShape& o_shape = out_tensor->GetShape(); + std::vector o_dims=o_shape.GetDim(); + + int in_n=i_shape.GetN(); + int in_h=i_shape.GetH(); + int in_w=i_shape.GetW(); + int in_c=i_shape.GetC(); + + int out_n=o_shape.GetN(); + int out_h=o_shape.GetH(); + int out_w=o_shape.GetW(); + int out_c=o_shape.GetC(); + + int in_size=in_n*in_h*in_w*in_c; + int out_size=out_n*out_h*out_w*out_c; + + param.mode=op_param->mode; + if(param.mode==0) + { + param.cv_f32=op_param->value; + param.cv_f16=(__fp16 )fp32_to_fp16(op_param->value); + param.cv_int8=op_param->value; + param.cv_uint8=op_param->value; + } + param.pad_0_h=op_param->pad_0_h; + param.pad_0_w=op_param->pad_0_w; + param.pad_1_h=op_param->pad_1_h; + param.pad_1_w=op_param->pad_1_w; + param.pad_2_h=op_param->pad_2_h; + param.pad_2_w=op_param->pad_2_w; + param.pad_3_h=op_param->pad_3_h; + param.pad_3_w=op_param->pad_3_w; + + param.in_n=in_n; + param.in_h=in_h; + param.in_w=in_w; + param.in_c=in_c; + + param.out_n=out_n; + param.out_h=out_h; + param.out_w=out_w; + + param.in_size=in_size; + param.out_size=out_size; + + void* in_data=get_tensor_mem(input_tensor); + void* out_data=get_tensor_mem(out_tensor); + + int ret=kernel_run(in_data,out_data,¶m); + if(input_tensor->GetDataType() == TENGINE_DT_INT8) + { + auto* i_quant = input_tensor->GetQuantParam(); + auto* o_quant = out_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale =(*i_quant)[0].scale;; + o_quant->resize(0); + o_quant->push_back(q_param); + } + if(ret<0) + return false; + else + return true; +} + +void RefPad::InitRegistry(void) +{ + +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((pad_t)pad_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((pad_t)pad_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((pad_t)pad_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + 
kernel_registry.Register((pad_t)pad_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((pad_t)pad_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((pad_t)pad_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((pad_t)pad_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((pad_t)pad_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefPad* ops = new RefPad(); + + LOG_DEBUG()<<"Pad RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefSqueezeOps +void RegisterPadOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Pad", RefPadOps::SelectFunc, 1000); +} +} // namespace TEngine diff --git a/executor/operator/ref/prelu.cpp b/executor/operator/ref/prelu.cpp new file mode 100644 index 000000000..41f264d63 --- /dev/null +++ b/executor/operator/ref/prelu.cpp @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "kernel/prelu/prelu_kernel.h" + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/prelu.hpp" + +namespace TEngine { + +namespace RefPreluOps { + + + +struct PReluOps : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + bool Postrun(Node * node) override; + void InitRegistry(void); + prelu_param op_param; + prelu_t kernel_run; + + KernelRegistry kernel_registry; + + PReluOps(void) + { + kernel_run = nullptr; + + InitRegistry(); + } +}; + +bool PReluOps::Prerun(Node * node) +{ + Tensor * input = node->GetInputTensor(0); + int layout=exec_attr->graph_layout; + + op_param.layout = layout; + + + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool PReluOps::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool PReluOps::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + const TShape& shape = input_tensor->GetShape(); + const std::vector dims = shape.GetDim(); + if(input_tensor->GetDataType() == TENGINE_DT_INT8 || + input_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + auto quant_param = input_tensor->GetQuantParam(); + if(quant_param->size() != 1 ) + return false; + op_param.scale = (*quant_param)[0].scale; + op_param.zero = (*quant_param)[0].zero_point; + } + int ret = -1; + int dim0 = dims[0]; + int dim1 = dims[1]; + int dim2 = dims[2]; + int dim3 = dims[3]; + void* data = get_tensor_mem(input_tensor); + void* out_data = get_tensor_mem(output_tensor); + const Tensor* slope_tensor = 
node->GetInputTensor(1); + float* slope = ( float* )get_tensor_mem(slope_tensor); + ret = kernel_run(data,out_data,dim0,dim1,dim2,dim3,slope,&op_param); + + if(ret<0) + return false; + else + return true; + + + +} + +bool PReluOps::Postrun(Node * node) +{ + return true; +} + +void PReluOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((prelu_t)prelu_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((prelu_t)prelu_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((prelu_t)prelu_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((prelu_t)prelu_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((prelu_t)prelu_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((prelu_t)prelu_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((prelu_t)prelu_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((prelu_t)prelu_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + PReluOps* ops = new PReluOps(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefReluOps +void RegisterPreluOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "PReLU", RefPreluOps::SelectFunc, 1000); +} +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/reduction.cpp b/executor/operator/ref/reduction.cpp new file mode 100644 index 000000000..824dc797a --- /dev/null +++ b/executor/operator/ref/reduction.cpp @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +#include "operator/reduction.hpp" +#include "kernel/reduction/reduce.h" + +namespace TEngine { + +namespace RefReductionOps { + + + +struct RefReduction : public MTNodeOps +{ + bool Prerun(Node * node) override; + + bool Run(Node * node) override; + void InitRegistry(void); + + reduce_t kernel_run; + reduce_param param; + + KernelRegistry kernel_registry; + RefReduction(void) + { + InitRegistry(); + } +}; + + +bool RefReduction::Prerun(Node * node) +{ + Tensor * input_tensor=node->GetInputTensor(0); + + int layout=exec_attr->graph_layout; + + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + + return true; +} +static int get_scale_zero(Tensor* itensor,Tensor * otensor,reduce_param* param) +{ + auto* i_quant = itensor->GetQuantParam(); + auto* o_quant = otensor->GetQuantParam(); + if( i_quant->size() != 1 ) + return -1; + param->scale[0] = (*i_quant)[0].scale; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + if( o_quant->size() != 1) + return -1; + + param->scale[1] = (*o_quant)[0].scale; + param->zero[1] = 
(*o_quant)[0].zero_point; + + param->zero[0] = (*i_quant)[0].zero_point; + } + return 0; +} + +bool RefReduction::Run(Node * node) +{ + Reduction* reduction_op = dynamic_cast(node->GetOp()); + ReductionParam* op_param = reduction_op->GetParam(); + + Tensor * input_tensor = node->GetInputTensor(0); + Tensor * out_tensor = node->GetOutputTensor(0); + int element_size = DataType::GetTypeSize(out_tensor->GetDataType()); + int out_size= out_tensor->GetTotalSize() / element_size; + + if(input_tensor->GetDataType() == TENGINE_DT_INT8 ||input_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero(input_tensor,out_tensor, ¶m) < 0) + return false; + } + const TShape& i_shape = input_tensor->GetShape(); + + std::vector dims=i_shape.GetDim(); + + int dim0=dims[0]; + int dim1=dims[1]; + int dim2=dims[2]; + int dim3=dims[3]; + + param.param_dim[0]=op_param->dim_0; + param.param_dim[1]=op_param->dim_1; + param.param_dim[2]=op_param->dim_2; + param.param_dim[3]=op_param->dim_3; + param.type=op_param->type; + + void* in_data=get_tensor_mem(input_tensor); + void* out_data=get_tensor_mem(out_tensor); + + int ret=kernel_run(in_data,out_data,dim0,dim1,dim2,dim3,out_size,¶m); + if(input_tensor->GetDataType() == TENGINE_DT_INT8) + { + auto* o_quant = out_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale =param.scale[1]; + o_quant->resize(0); + o_quant->push_back(q_param); + } + if(ret<0) + return false; + else + return true; +} + +void RefReduction::InitRegistry(void) +{ + +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((reduce_t)reduce_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((reduce_t)reduce_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((reduce_t)reduce_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((reduce_t)reduce_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + 
kernel_registry.Register((reduce_t)reduce_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((reduce_t)reduce_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((reduce_t)reduce_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((reduce_t)reduce_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefReduction* ops = new RefReduction(); + + LOG_DEBUG()<<"Reduction RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefSqueezeOps +void RegisterReductionOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Reduction", RefReductionOps::SelectFunc, 1000); +} +} // namespace TEngine diff --git a/executor/operator/ref/ref_add_n.cpp b/executor/operator/ref/ref_add_n.cpp new file mode 100644 index 000000000..be0ec8dc3 --- /dev/null +++ b/executor/operator/ref/ref_add_n.cpp @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: ruizhang@openailab.com + */ +#include +#include +#include +#include "kernel_registry.hpp" +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "graph.hpp" +#include "tengine_errno.hpp" +#include "operator/add_n.hpp" +#include "kernel/ref_add_n/ref_addn_kernel.h" +#include + +namespace TEngine { + +namespace RefAddNImpl { +//const int default_prio = 1500; +struct RefAddNOps : public NodeOps +{ + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + void InitRegistry(void); + RefAddNOps() + { + kernel_run = nullptr; + InitRegistry(); + } + struct ref_addn_param op_param; + ref_add_n_kernel_t kernel_run; + uint8_t** in_data_ptrs; + KernelRegistry kernel_registry; +}; + +void RefAddNOps::InitRegistry(void) +{ + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_add_n_kernel_t)ref_addn_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + +} + +bool RefAddNOps::Prerun(Node* node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + int data_type = input_tensor->GetDataType(); + int layout = exec_attr->graph_layout; + unsigned 
int input_num = node->GetInputNum(); + op_param.input_size = input_tensor->GetTotalSize(); + op_param.in_num = input_num; + op_param.in_scale=new float[input_num]; + op_param.in_zero =new int[input_num]; + in_data_ptrs = new uint8_t*[input_num]; + + if(!kernel_registry.GetKernel(kernel_run,layout,data_type)) + { + set_tengine_errno(ENOENT); + return false; + } + return true; +} + +bool RefAddNOps::Run(Node* node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + const int data_type = input_tensor->GetDataType(); + + for(int i = 0; i < op_param.in_num; ++i) + { + Tensor* input_tensor = node->GetInputTensor(i); + auto* in_quant = input_tensor->GetQuantParam(); + if(in_quant->size()) + { + op_param.in_scale[i] = (*in_quant)[0].scale; + op_param.in_zero[i] = (*in_quant)[0].zero_point; + } + + in_data_ptrs[i] = (uint8_t*)get_tensor_mem(input_tensor); + } + + Tensor* output_tensor = node->GetOutputTensor(0); + uint8_t* out_data = (uint8_t*)get_tensor_mem(output_tensor); + memset(out_data, 0, op_param.input_size); + if( data_type == TENGINE_DT_UINT8 ) + { + auto* o_quant = output_tensor->GetQuantParam(); + op_param.out_scale = (*o_quant)[0].scale; + op_param.out_zero = (*o_quant)[0].zero_point; + } + int ret = kernel_run(in_data_ptrs, out_data, &op_param); + if(ret<0) + return false; + + if( data_type == TENGINE_DT_INT8 ) + { + Tensor* o_tensor = node->GetOutputTensor(0); + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + return true; +} + +bool RefAddNOps::Postrun(Node* node) +{ + free(in_data_ptrs); + free(op_param.in_scale); + free(op_param.in_zero); + + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + RefAddNOps* ops = new RefAddNOps(); + return ops; +} + +} // namespace RefAddNImpl + +using namespace RefAddNImpl; + +void RegisterRefAddNOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, 
"Addn", RefAddNImpl::SelectFunc, 1000); +} + +} // namespace TEngine diff --git a/executor/operator/ref/ref_batchnorm.cpp b/executor/operator/ref/ref_batchnorm.cpp new file mode 100644 index 000000000..f26eb76f3 --- /dev/null +++ b/executor/operator/ref/ref_batchnorm.cpp @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: ruizhang@openailab.com + */ +#include +#include +#include +#include "kernel_registry.hpp" +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "tengine_errno.hpp" +#include "graph.hpp" +#include "operator/batch_norm.hpp" +#include "kernel/ref_batchnorm/ref_batchnorm_kernel.h" +#include + +namespace TEngine { + +namespace RefBatchNormImpl{ + +struct RefBatchNormOps : public NodeOps +{ + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + void InitRegistry(void); + RefBatchNormOps() + { + kernel_run = nullptr; + InitRegistry(); + } + struct ref_batchnorm_param op_param; + ref_batchnorm_kernel_t kernel_run; + KernelRegistry kernel_registry; +}; + +void RefBatchNormOps::InitRegistry(void) +{ + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_batchnorm_kernel_t)ref_batchnorm_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + +} +bool RefBatchNormOps::Prerun(Node* node) +{ + BatchNorm* bn_op = dynamic_cast(node->GetOp()); + BatchNormParam* param = 
bn_op->GetParam(); + + const Tensor* input_tensor = node->GetInputTensor(0); + int data_type = input_tensor->GetDataType(); + const TShape& shape = input_tensor->GetShape(); + const std::vector dims = shape.GetDim(); + int channel_num = dims[1]; + float* scale_mean = ( float* )mem_alloc(channel_num * sizeof(float)); + float* scale_var_inv = ( float* )mem_alloc(channel_num * sizeof(float)); + const Tensor* mean_tensor = node->GetInputTensor(3); + const Tensor* var_tensor = node->GetInputTensor(4); + const float* mean = ( const float* )get_tensor_mem(mean_tensor); + const float* var = ( const float* )get_tensor_mem(var_tensor); + + float rescale_factor; + float eps = param->eps; + + rescale_factor = param->rescale_factor ? 1 / param->rescale_factor : 0; + for(int c = 0; c < channel_num; c++) + { + scale_var_inv[c] = 1.f / sqrt(var[c] * rescale_factor + eps); + scale_mean[c] = -mean[c] * rescale_factor * scale_var_inv[c]; + } + float* gamma = NULL; + float* beta = NULL; + if(!param->caffe_flavor) + { + const Tensor* gamma_tensor = node->GetInputTensor(1); + const Tensor* beta_tensor = node->GetInputTensor(2); + gamma = (float* )get_tensor_mem(gamma_tensor); + beta = (float* )get_tensor_mem(beta_tensor); + } + int layout = exec_attr->graph_layout; + op_param.iscaffe = param->caffe_flavor; + op_param.scale_mean = scale_mean; + op_param.scale_var_inv = scale_var_inv; + op_param.gamma = gamma; + op_param.beta = beta; + op_param.layout = layout; + + if(!kernel_registry.GetKernel(kernel_run, layout, data_type)) + { + set_tengine_errno(ENOENT); + return false; + } + return true; +} + +bool RefBatchNormOps::Run(Node* node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + const TShape& shape = input_tensor->GetShape(); + const std::vector dims = shape.GetDim(); + + if(TENGINE_LAYOUT_NCHW == op_param.layout) + { + if(4 == dims.size()) + { + op_param.input_n = dims[0]; + op_param.input_c = dims[1]; + op_param.input_h = dims[2]; + op_param.input_w = dims[3]; + } + else if(3 
== dims.size()) + { + op_param.input_n = dims[0]; + op_param.input_c = dims[1]; + op_param.input_w = dims[2]; + op_param.input_h = 1; + } + else + { + return false; + } + } + else + { + if(4 == dims.size()) + { + op_param.input_n = dims[0]; + op_param.input_c = dims[3]; + op_param.input_h = dims[1]; + op_param.input_w = dims[2]; + } + else if(3 == dims.size()) + { + op_param.input_n = dims[0]; + op_param.input_c = dims[2]; + op_param.input_w = dims[1]; + op_param.input_h = 1; + } + else + { + return false; + } + } + auto* in_quant = input_tensor->GetQuantParam(); + if(in_quant->size()) + { + op_param.in_scale = (*in_quant)[0].scale; + op_param.in_zero = (*in_quant)[0].zero_point; + } + uint8_t* input = (uint8_t*)get_tensor_mem(input_tensor); + Tensor* output_tensor = node->GetOutputTensor(0); + uint8_t*out_data = (uint8_t*)get_tensor_mem(output_tensor); + const int data_type = input_tensor->GetDataType(); + if( data_type == TENGINE_DT_UINT8 ) + { + auto* o_quant = output_tensor->GetQuantParam(); + op_param.out_scale = (*o_quant)[0].scale; + op_param.out_zero = (*o_quant)[0].zero_point; + } + int ret = kernel_run(input, out_data, &op_param); + if(ret<0) + return false; + + if(data_type == TENGINE_DT_INT8 ) + { + Tensor* o_tensor = node->GetOutputTensor(0); + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + + return true; +} + +bool RefBatchNormOps::Postrun(Node* node) +{ + free(op_param.scale_mean); + free(op_param.scale_var_inv); + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + RefBatchNormOps* ops = new RefBatchNormOps(); + return ops; +} + +}// namespace RefBatchNormImpl + + +void RegisterRefBatchNormOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "BatchNormalization", + RefBatchNormImpl::SelectFunc, 1000); +} + +} // namespace TEngine diff --git a/executor/operator/ref/ref_concat.cpp 
b/executor/operator/ref/ref_concat.cpp new file mode 100644 index 000000000..d7624e036 --- /dev/null +++ b/executor/operator/ref/ref_concat.cpp @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "graph.hpp" +#include "operator/concat.hpp" + +#include "kernel/concat/concat_kernel.h" + +namespace TEngine +{ + namespace RefConcatOps + { + const int default_prio = 1500; + struct RefConcat : public MTNodeOps + { + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + void InitRegistry(void); + + RefConcat() + { + kernel_run = nullptr; + InitRegistry(); + } + + struct concat_param op_param; + concat_t kernel_run; + void** input_data; + KernelRegistry kernel_registry; + }; + + void RefConcat::InitRegistry(void) + { + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((concat_t)ref_concat_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + 
kernel_registry.Register((concat_t)ref_concat_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((concat_t)ref_concat_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((concat_t)ref_concat_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((concat_t)ref_concat_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((concat_t)ref_concat_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((concat_t)ref_concat_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((concat_t)ref_concat_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + + } + + bool RefConcat::Prerun(Node* node) + { + int layout=exec_attr->graph_layout; + Tensor* output_tensor = node->GetOutputTensor(0); + Concat* concat_op = dynamic_cast(node->GetOp()); + ConcatParam* param = concat_op->GetParam(); + + Tensor* input_tensor = node->GetInputTensor(0); + int data_type = input_tensor->GetDataType(); + op_param.axis = param->axis; + + int in_nums = node->GetInputNum(); + input_data = new void*[in_nums]; + op_param.input_shape = new shape_dim[in_nums]; + op_param.input_counts = in_nums; + + auto dims = output_tensor->GetShape().GetDim(); + op_param.output_dim = (int)(dims.size()); + for(std::size_t ii=0; iiGetOutputTensor(0); + void* output = get_tensor_mem(o_tensor); + int data_type = -1; + for(int ii=0; iiGetInputTensor(ii); + data_type = input_tensor->GetDataType(); + auto* in_quant = input_tensor->GetQuantParam(); + if( (*in_quant).size() != 0 ) + { + op_param.input_shape[ii].scale = (*in_quant)[0].scale; + op_param.input_shape[ii].zero = (*in_quant)[0].zero_point; + } + else + { + op_param.input_shape[ii].scale = 1; + op_param.input_shape[ii].zero = 0; + } + + auto dims = input_tensor->GetShape().GetDim(); + op_param.input_dim = (int)(dims.size()); + for(std::size_t jj=0; 
jjGetQuantParam(); + if( (*o_quant).size() !=0) + { + op_param.output_shape.scale = (*o_quant)[0].scale; + op_param.output_shape.zero = (*o_quant)[0].zero_point; + } + else + { + op_param.output_shape.scale = 1; + op_param.output_shape.zero = 0; + } + + const void ** input = (const void**)input_data; + int ret = kernel_run(input, output, &op_param); + if(ret<0) + return false; + + if( data_type == TENGINE_DT_INT8 ) + { + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + + return true; + } + + bool RefConcat::Postrun(Node* node) + { + delete[] input_data; + delete[] op_param.input_shape; + return true; + } + + NodeOps* SelectFunc(const CPUInfo* info, Node* node) + { + RefConcat* ops = new RefConcat(); + + LOG_DEBUG()<<"Refconcat is selected\n"; + + return ops; + } + + + } //end namespace RefConcatOps + + void RegisterRefConcat(void) + { + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Concat", RefConcatOps::SelectFunc,RefConcatOps::default_prio); + } + +} diff --git a/executor/operator/ref/ref_convolution.cpp b/executor/operator/ref/ref_convolution.cpp new file mode 100644 index 000000000..e25052cfc --- /dev/null +++ b/executor/operator/ref/ref_convolution.cpp @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haoluo@openailab.com + */ +#include +#include +#include +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" + +#include "graph.hpp" +#include "operator/convolution.hpp" +#include "kernel/convolution/ref_conv_kernel.h" + +namespace TEngine { + +namespace RefConvolutionOps { + +const int default_prio = 1500; + +inline static int get_scale_zero(Tensor* itensor, Tensor* otensor, Tensor* ktensor, op_data* param) +{ + auto* i_quant = itensor->GetQuantParam(); + auto* k_quant = ktensor->GetQuantParam(); + auto* o_quant = otensor->GetQuantParam(); + if( i_quant->size() != 1 || k_quant->size() != 1) + { + std::cerr<<"quant size: input("<< i_quant->size()<<"),kernel("<size()<<")\n"; + return -1; + } + param->scale[0] = (*i_quant)[0].scale; + param->scale[1] = (*k_quant)[0].scale; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + if( o_quant->size() != 1) + { + std::cerr<<"output quant size: "<size()<<"\n"; + return -1; + } + + param->scale[2] = (*o_quant)[0].scale; + param->zero[2] = (*o_quant)[0].zero_point; + + param->zero[0] = (*i_quant)[0].zero_point; + param->zero[1] = (*k_quant)[0].zero_point; + } + //printf("scale: %f,%f,%f -- zero : %d,%d,%d \n", + // param->scale[0],param->scale[1],param->scale[2], + // param->zero[0],param->zero[1],param->zero[2]); + return 0; +} + +struct RefConv : public MTNodeOps +{ + bool Prerun(Node* node) override; + bool 
Reshape(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + void InitRegistry(void); + + bool dynamic_shape; + op_data op_param; + + ref_conv_kernel_t kernel_run; + KernelRegistry kernel_registry; + RefConv(void) + { + kernel_run=nullptr; + InitRegistry(); + } +}; +void RefConv::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_conv_kernel_t)ref_conv_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_conv_kernel_t)ref_conv_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_conv_kernel_t)ref_conv_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_conv_kernel_t)ref_conv_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif + +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_conv_kernel_t)ref_conv_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_conv_kernel_t)ref_conv_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_conv_kernel_t)ref_conv_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_conv_kernel_t)ref_conv_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +bool RefConv::Prerun(Node* node) +{ + int layout=exec_attr->graph_layout; + + Convolution* conv_op = dynamic_cast(node->GetOp()); + ConvParam* param = conv_op->GetParam(); + + Tensor* input_tensor = node->GetInputTensor(0); + op_param.batch = input_tensor->GetShape().GetN(); + op_param.in_shape[0] = input_tensor->GetShape().GetC(); + op_param.in_shape[1] = input_tensor->GetShape().GetH(); + op_param.in_shape[2] = input_tensor->GetShape().GetW(); + + Tensor* kernel_tensor = node->GetInputTensor(1); + op_param.kernels[0] = kernel_tensor->GetShape().GetH(); + op_param.kernels[1] = kernel_tensor->GetShape().GetW(); + + Tensor* output_tensor = node->GetOutputTensor(0); + op_param.out_shape[0] = 
output_tensor->GetShape().GetC(); + op_param.out_shape[1] = output_tensor->GetShape().GetH(); + op_param.out_shape[2] = output_tensor->GetShape().GetW(); + + op_param.strides[0] = param->stride_h; + op_param.strides[1] = param->stride_w; + + op_param.dilations[1] = param->dilation_h; + op_param.dilations[0] = param->dilation_w; + + op_param.pads[0] = param->pad_h0; + op_param.pads[1] = param->pad_w0; + op_param.group = param->group; + op_param.activation = param->activation; + op_param.layout = layout; + + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefConv::Reshape(Node* node) +{ + + Tensor* input_tensor = node->GetInputTensor(0); + op_param.batch = input_tensor->GetShape().GetN(); + op_param.in_shape[0] = input_tensor->GetShape().GetC(); + op_param.in_shape[1] = input_tensor->GetShape().GetH(); + op_param.in_shape[2] = input_tensor->GetShape().GetW(); + + Tensor* output_tensor = node->GetOutputTensor(0); + op_param.out_shape[0] = output_tensor->GetShape().GetC(); + op_param.out_shape[1] = output_tensor->GetShape().GetH(); + op_param.out_shape[2] = output_tensor->GetShape().GetW(); + + return true; +} +bool RefConv::Run(Node* node) +{ + //printf("---------------------------- Run ref_conv!!!\n"); + Tensor* i_tensor = node->GetInputTensor(0); + const void* input = get_tensor_mem(i_tensor); + Tensor* k_tensor = node->GetInputTensor(1); + const void* kernel = get_tensor_mem(k_tensor); + Tensor* b_tensor = node->GetInputTensor(2); + const void* bias = nullptr; + if(b_tensor != nullptr) + bias = get_tensor_mem(b_tensor); + Tensor* o_tensor = node->GetOutputTensor(0); + void* output = get_tensor_mem(o_tensor); + + /* Get input,kernel,output scale & zero */ + /* Current: one tensor has only one quantparam(scale)*/ + if(i_tensor->GetDataType() == TENGINE_DT_INT8 || + i_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero(i_tensor, o_tensor, 
k_tensor, &op_param) < 0) + return false; + } + + int ret = kernel_run(input,output,kernel,bias,&op_param); + if(i_tensor->GetDataType() == TENGINE_DT_INT8) + { + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.scale[2]; + o_quant->resize(0); + o_quant->push_back(q_param); + } + if(ret<0) + return false; + return true; +} + +bool RefConv::Postrun(Node* node) +{ + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + RefConv* ops = new RefConv(); + + return ops; +} + +} // namespace RefConvolutionOps + +void RegisterRefConv2d(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Convolution", RefConvolutionOps::SelectFunc, + RefConvolutionOps::default_prio); +} + +} // namespace TEngine diff --git a/executor/operator/ref/ref_deconvolution.cpp b/executor/operator/ref/ref_deconvolution.cpp new file mode 100644 index 000000000..cb0a8b2e2 --- /dev/null +++ b/executor/operator/ref/ref_deconvolution.cpp @@ -0,0 +1,257 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haoluo@openailab.com + */ +#include +#include +#include +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" + +#include "graph.hpp" +#include "operator/deconvolution.hpp" +#include "kernel/deconvolution/ref_deconv_kernel.h" + +namespace TEngine { + +namespace RefDeconvolutionOps { + +const int default_prio = 1500; + +struct RefDeconv : public MTNodeOps +{ + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Reshape(Node* node) override; + bool Postrun(Node* node) override; + bool GetSharedMemorySize(Node*, unsigned int& mem_size) override; + bool SetSharedMemoryAddr(Node*, void* mem_addr, int mem_size) override; + void InitRegistry(void); + + int element_size; + bool dynamic_shape; + deconv_ref_param op_param; + + ref_deconv_kernel_t kernel_run; + KernelRegistry kernel_registry; + RefDeconv(void) + { + kernel_run=nullptr; + InitRegistry(); + } +}; +void RefDeconv::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + 
kernel_registry.Register((ref_deconv_kernel_t)ref_deconv_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} +bool RefDeconv::Reshape(Node* node) +{ + return true; +} + +bool RefDeconv::SetSharedMemoryAddr(Node* node, void* mem_addr, int mem_size) +{ + return true; +} + +bool RefDeconv::GetSharedMemorySize(Node* node, unsigned int& mem_size) +{ + return true; +} + +bool RefDeconv::Prerun(Node* node) +{ + int layout=exec_attr->graph_layout; + + Deconvolution* deconv_op = dynamic_cast(node->GetOp()); + DeconvParam* param = deconv_op->GetParam(); + + Tensor* input_tensor = node->GetInputTensor(0); + TShape inshape = input_tensor->GetShape(); + + if(0 == layout) // nchw + { + op_param.batch = inshape.Shape(0); + op_param.in_shape[0] = inshape.Shape(1); + op_param.in_shape[1] = inshape.Shape(2); + op_param.in_shape[2] = inshape.Shape(3); + } + else // nhwc + { + op_param.batch = inshape.Shape(0); + op_param.in_shape[0] = inshape.Shape(3); + op_param.in_shape[1] = inshape.Shape(1); + op_param.in_shape[2] = inshape.Shape(2); + } + + /* kernel quant param */ + Tensor* kernel_tensor = node->GetInputTensor(1); + auto* k_quant = kernel_tensor->GetQuantParam(); + if( (*k_quant).size() !=0) + { + op_param.scale[1] = (*k_quant)[0].scale; + op_param.zero[1] = (*k_quant)[0].zero_point; + } + + TShape wshape = kernel_tensor->GetShape(); + + if(0 == layout) // hw + { + op_param.kernels[0] = wshape.Shape(2); + op_param.kernels[1] = wshape.Shape(3); + } + else // + { + op_param.kernels[0] = wshape.Shape(1); + op_param.kernels[1] = wshape.Shape(2); + } + + /* output quant param */ + Tensor* output_tensor = node->GetOutputTensor(0); + auto* o_quant = output_tensor->GetQuantParam(); + if( (*o_quant).size() !=0) + { + op_param.scale[2] = (*o_quant)[0].scale; + op_param.zero[2] = (*o_quant)[0].zero_point; + } + + TShape outshape = output_tensor->GetShape(); + + if(0 == layout) // chw + { + op_param.out_shape[0] = outshape.Shape(1); + op_param.out_shape[1] = outshape.Shape(2); + 
op_param.out_shape[2] = outshape.Shape(3); + } + else + { + op_param.out_shape[0] = outshape.Shape(3); + op_param.out_shape[1] = outshape.Shape(1); + op_param.out_shape[2] = outshape.Shape(2); + } + + op_param.strides[0] = param->stride_h; + op_param.strides[1] = param->stride_w; + + op_param.dilations[1] = param->dilation_h; + op_param.dilations[0] = param->dilation_w; + + op_param.pads[0] = param->pad_h0; //pad_h + op_param.pads[1] = param->pad_w0; //pad_w + + op_param.group = param->group; + op_param.activation = param->activation; + op_param.layout = layout; + + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefDeconv::Run(Node* node) +{ + //printf("run ref_deconv!!!\n"); + Tensor* i_tensor = node->GetInputTensor(0); + const void* input = get_tensor_mem(i_tensor); + Tensor* k_tensor = node->GetInputTensor(1); + const void* kernel = get_tensor_mem(k_tensor); + Tensor* b_tensor = node->GetInputTensor(2); + const void* bias = nullptr; + if(b_tensor != nullptr) + bias = get_tensor_mem(b_tensor); + Tensor* o_tensor = node->GetOutputTensor(0); + void* output = get_tensor_mem(o_tensor); + + /* input quant param */ + auto* in_quant = i_tensor->GetQuantParam(); + if((*in_quant).size() !=0) + { + op_param.scale[0] = (*in_quant)[0].scale; + op_param.zero[0] = (*in_quant)[0].zero_point; + } + + int ret = kernel_run(input,output,kernel,bias,&op_param); + if(ret<0) + return false; + if(i_tensor->GetDataType() == TENGINE_DT_INT8) + { + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.scale[2]; + o_quant->resize(0); + o_quant->push_back(q_param); + } + return true; +} + +bool RefDeconv::Postrun(Node* node) +{ + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + RefDeconv* ops = new RefDeconv(); + + if(node->IsDynamicShape()) + ops->dynamic_shape = true; + else + ops->dynamic_shape = false; + + 
return ops; +} + +} // namespace RefDeconvolutionOps + +void RegisterRefDeconv2d(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Deconvolution", RefDeconvolutionOps::SelectFunc,RefDeconvolutionOps::default_prio); +} + +} // namespace TEngine diff --git a/executor/operator/ref/ref_detection_postprocess.cpp b/executor/operator/ref/ref_detection_postprocess.cpp new file mode 100644 index 000000000..4a82891f8 --- /dev/null +++ b/executor/operator/ref/ref_detection_postprocess.cpp @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#include +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/detection_postprocess.hpp" +#include "kernel/dpp/ref_dpp_kernel.h" + +namespace TEngine { + +namespace RefDetectionPostOps { + +struct RefDetectionPost : public NodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + dpp_param param; + ref_dpp_kernel_t kernel_run; + KernelRegistry kernel_registry; + + RefDetectionPost(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool RefDetectionPost::Prerun(Node * node) +{ + if(node->GetInputNum() != 3 || node->GetOutputNum()!=4) + return false; + + int layout = exec_attr->graph_layout; + DetectionPostProcess* dpp_op = dynamic_cast(node->GetOp()); + DetectionPostProcessParam* param_ = dpp_op->GetParam(); + param.max_classes_per_detection = param_->max_classes_per_detection; + param.nms_iou_threshold = param_->nms_iou_threshold; + param.nms_score_threshold = param_->nms_score_threshold; + param.num_classes = param_->num_classes; + param.max_detections = param_->max_detections; + param.scales[0] = param_->scales[0]; + param.scales[1] = param_->scales[1]; + param.scales[2] = param_->scales[2]; + param.scales[3] = param_->scales[3]; + + Tensor* input = node->GetInputTensor(0); + if( input->GetDataType() != TENGINE_DT_FP32 && + input->GetDataType() != TENGINE_DT_FP16 && + input->GetDataType() != TENGINE_DT_UINT8) + return false; + param.num_boxes = input->GetShape().Shape(1); + auto i_quant = input->GetQuantParam(); + + Tensor* score = node->GetInputTensor(1); + auto s_quant = score->GetQuantParam(); + + Tensor* anchor = node->GetInputTensor(2); + auto a_quant = anchor->GetQuantParam(); + + if(input->GetDataType() == TENGINE_DT_UINT8) + { + 
if(i_quant->size() == 0 || s_quant->size() == 0 || a_quant->size() == 0) + { + std::cerr<<"RefDetectionPost one quant is NONE: <"<size()<<"," + <size()<<","<size()<<"\n"; + return false; + } + param.quant_scale[0] = (*i_quant)[0].scale; + param.quant_scale[1] = (*s_quant)[0].scale; + param.quant_scale[2] = (*a_quant)[0].scale; + param.zero[0] = (*i_quant)[0].zero_point; + param.zero[1] = (*s_quant)[0].zero_point; + param.zero[2] = (*a_quant)[0].zero_point; + } + + if(!kernel_registry.GetKernel(kernel_run, layout, input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefDetectionPost::Run(Node * node) +{ + if(kernel_run == nullptr) + return false; + + //printf(" ********** run ref dpp\n"); + + Tensor* input = node->GetInputTensor(0); + const void* input_data = get_tensor_mem(input); + Tensor* score = node->GetInputTensor(1); + void* score_data = get_tensor_mem(score); + Tensor* anchor = node->GetInputTensor(2); + void* anchor_data = get_tensor_mem(anchor); + + Tensor* detect_boxes = node->GetOutputTensor(0); + float* detect_boxes_data = (float*)get_tensor_mem(detect_boxes); + Tensor* detect_classes = node->GetOutputTensor(1); + float* detect_classes_data = (float*)get_tensor_mem(detect_classes); + Tensor* detect_scores = node->GetOutputTensor(2); + float* detect_scores_data = (float*)get_tensor_mem(detect_scores); + Tensor* detect_num = node->GetOutputTensor(3); + float* detect_num_data = (float*)get_tensor_mem(detect_num); + + if(kernel_run(input_data, score_data, anchor_data, detect_num_data, + detect_classes_data, detect_scores_data, detect_boxes_data, ¶m)<0) + return false; + + return true; +} + +void RefDetectionPost::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_dpp_kernel_t)ref_dpp_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_dpp_kernel_t)ref_dpp_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + 
kernel_registry.Register((ref_dpp_kernel_t)ref_dpp_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_dpp_kernel_t)ref_dpp_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_dpp_kernel_t)ref_dpp_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_dpp_kernel_t)ref_dpp_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefDetectionPost* ops = new RefDetectionPost(); + + LOG_DEBUG()<<"Demo RefDetectionPost is selected\n"; + + return ops; +} + +} // namespace RefDetectionPostOps + +void RegisterRefDetectionPostOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "DetectionPostProcess", RefDetectionPostOps::SelectFunc, 1000); +} + + +} // namespace TEngine diff --git a/executor/operator/ref/ref_fully_connected.cpp b/executor/operator/ref/ref_fully_connected.cpp new file mode 100644 index 000000000..e189ef5f4 --- /dev/null +++ b/executor/operator/ref/ref_fully_connected.cpp @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#include +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/fully_connected.hpp" +#include "kernel/fully_connected/ref_fc_kernel.h" + +namespace TEngine { + +namespace RefFCOps { + +struct RefFC : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + struct fc_data param; + ref_fc_kernel_t kernel_run; + KernelRegistry kernel_registry; + + RefFC(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool RefFC::Prerun(Node * node) +{ + int layout = exec_attr->graph_layout; + FullyConnected* fc_op = dynamic_cast(node->GetOp()); + FCParam* param_ = fc_op->GetParam(); + param.out_number = param_->num_output; + + Tensor* input = node->GetInputTensor(0); + auto i_quant = input->GetQuantParam(); + + Tensor* weight = node->GetInputTensor(1); + int weight_out = weight->GetShape().Shape(0); + if(weight_out == param.out_number) + param.need_trans = 0; + else + param.need_trans = 1; + auto w_quant = weight->GetQuantParam(); + + Tensor* output = node->GetOutputTensor(0); + auto o_quant = output->GetQuantParam(); + + if(input->GetDataType() == TENGINE_DT_UINT8) + { + if(i_quant->size() == 0 || w_quant->size() == 0 || o_quant->size() == 0) + { + std::cerr<<"FC one quant is NONE: <"<size()<<"," + <size()<<","<size()<<"\n"; + return false; + } + param.scale[0] = (*i_quant)[0].scale; + param.scale[1] = (*w_quant)[0].scale; + param.scale[2] = (*o_quant)[0].scale; + param.zero[0] = (*i_quant)[0].zero_point; + param.zero[1] = (*w_quant)[0].zero_point; + param.zero[2] = (*o_quant)[0].zero_point; + } + else if(input->GetDataType() == TENGINE_DT_INT8) + { + if(w_quant->size() == 0) + { + std::cerr<<"FC weight quant size is NONE\n"; + return false; + } + 
param.scale[1] = (*w_quant)[0].scale; + param.zero[1] = (*w_quant)[0].zero_point; + } + + if(!kernel_registry.GetKernel(kernel_run, layout, input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefFC::Run(Node * node) +{ + if(kernel_run == nullptr) + return false; + + Tensor* input = node->GetInputTensor(0); + param.batch = input->GetShape().Shape(0); + param.hidden = input->GetShape().GetSize()/param.batch; + const void* input_data = get_tensor_mem(input); + Tensor* weight = node->GetInputTensor(1); + + void* weight_data = get_tensor_mem(weight); + + Tensor* output = node->GetOutputTensor(0); + void* output_data = get_tensor_mem(output); + + /* INT8 get input scale */ + if(input->GetDataType() == TENGINE_DT_INT8) + { + auto i_quant = input->GetQuantParam(); + param.scale[0] = (*i_quant)[0].scale; + param.zero[0] = (*i_quant)[0].zero_point; + } + + void* bias_data = nullptr; + if(node->GetInputNum() > 2) + { + Tensor* bias = node->GetInputTensor(2); + bias_data = get_tensor_mem(bias); + } + if(kernel_run(input_data, output_data, weight_data, bias_data, ¶m)<0) + return false; + + /* INT8 set output scale */ + if(input->GetDataType() == TENGINE_DT_INT8) + { + auto o_quant = output->GetQuantParam(); + o_quant->resize(1); + (*o_quant)[0].scale = param.scale[2]; + (*o_quant)[0].zero_point = param.zero[2]; + } + + return true; +} + +void RefFC::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_fc_kernel_t)ref_fc_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_fc_kernel_t)ref_fc_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_fc_kernel_t)ref_fc_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_fc_kernel_t)ref_fc_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif + +#ifdef CONFIG_KERNEL_INT8 + 
kernel_registry.Register((ref_fc_kernel_t)ref_fc_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_fc_kernel_t)ref_fc_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_fc_kernel_t)ref_fc_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_fc_kernel_t)ref_fc_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefFC* ops = new RefFC(); + + LOG_DEBUG()<<"Demo RefFCOp is selected\n"; + + return ops; +} + +} // namespace RefFCOps + +void RegisterRefFCOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "FullyConnected", RefFCOps::SelectFunc, 1000); +} + + +} // namespace TEngine diff --git a/executor/operator/ref/ref_lrn.cpp b/executor/operator/ref/ref_lrn.cpp new file mode 100644 index 000000000..262462b41 --- /dev/null +++ b/executor/operator/ref/ref_lrn.cpp @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ + +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "graph.hpp" +#include "operator/lrn.hpp" + +#include "kernel/lrn/ref_lrn_kernel.h" + +namespace TEngine +{ + namespace RefLrnOps + { + const int default_prio = 1500; + struct RefLrn : public MTNodeOps + { + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool RunNHWC(Node* node); + bool RunNCHW(Node* node); + bool Postrun(Node* node) override; + void InitRegistry(void); + + RefLrn() + { + kernel_run = nullptr; + InitRegistry(); + } + + ref_lrn_param op_param; + ref_lrn_kernel_t kernel_run; + KernelRegistry kernel_registry; + }; + + void RefLrn::InitRegistry(void) + { + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_lrn_kernel_t)ref_lrn_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + } + + bool RefLrn::Prerun(Node* node) + { + int layout = exec_attr->graph_layout; + Tensor* input_tensor = node->GetInputTensor(0); + int data_type = input_tensor->GetDataType(); + + LRN* lrn_op = 
dynamic_cast(node->GetOp()); + LRNParam* param = lrn_op->GetParam(); + + op_param.layout = layout; + op_param.alpha = param->alpha; + op_param.beta = param->beta; + op_param.bias = param->k; + op_param.local_size = param->local_size; + op_param.norm_region = param->norm_region; + + auto dims = input_tensor->GetShape().GetDim(); + for(unsigned int i = 0; i < dims.size(); i++) + { + op_param.dims[i] = dims[i]; + } + + if(!kernel_registry.GetKernel(kernel_run, layout, data_type)) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; + } + + bool RefLrn::Run(Node* node) + { + if(exec_attr->graph_layout == TENGINE_LAYOUT_NCHW) + { + return RunNCHW(node); + } + else + { + // TODO: support NCHW + return false; + } + } + + bool RefLrn::RunNCHW(Node* node) + { + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + void* input = get_tensor_mem(input_tensor); + void* output = get_tensor_mem(output_tensor); + int data_type = input_tensor->GetDataType(); + + auto dims = input_tensor->GetShape().GetDim(); + for(unsigned int i = 0; i < dims.size(); i++) + { + op_param.dims[i] = dims[i]; + } + + auto* in_quant = input_tensor->GetQuantParam(); + if((*in_quant).size() != 0 ) + { + op_param.scale[0] = (*in_quant)[0].scale; + op_param.zero[0] = (*in_quant)[0].zero_point; + } + else + { + op_param.scale[0] = 1; + op_param.zero[0] = 0; + } + + auto* o_quant = output_tensor->GetQuantParam(); + if((*o_quant).size() !=0) + { + op_param.scale[1] = (*o_quant)[0].scale; + op_param.zero[1] = (*o_quant)[0].zero_point; + } + else + { + op_param.scale[1] = 1; + op_param.zero[1] = 0; + } + + if(kernel_run(input, output, &op_param) < 0) + return false; + + if(data_type == TENGINE_DT_INT8) + { + QuantParam q_param; + q_param.scale = op_param.scale[1]; + o_quant->resize(0); + o_quant->push_back(q_param); + } + + return true; + } + + bool RefLrn::Postrun(Node* node) + { + return true; + } + + NodeOps* SelectFunc(const CPUInfo* info, 
Node* node) + { + RefLrn* ops = new RefLrn(); + + LOG_DEBUG()<<"RefLrn is selected\n"; + + return ops; + } + + + } //end namespace RefLrnOps + + void RegisterRefLrn(void) + { + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "LRN", RefLrnOps::SelectFunc, RefLrnOps::default_prio); + } +} + diff --git a/executor/operator/ref/ref_normalize.cpp b/executor/operator/ref/ref_normalize.cpp new file mode 100644 index 000000000..a227d3a6f --- /dev/null +++ b/executor/operator/ref/ref_normalize.cpp @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: ruizhang@openailab.com + */ +#include +#include +#include +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" + +#include "graph.hpp" +#include "operator/normalize.hpp" +#include "kernel/ref_normalize/ref_normalize_kernel.h" + +namespace TEngine { + +namespace RefNormalizeOps { + + +struct RefNormalize : public MTNodeOps +{ + bool Prerun(Node* node) override; + bool Run(Node* node) override; + void InitRegistry(void); + ref_normalize_param op_param; + ref_normalize_kernel_t kernel_run; + KernelRegistry kernel_registry; + RefNormalize(void) + { + kernel_run=nullptr; + InitRegistry(); + } +}; + +void RefNormalize::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_normalize_kernel_t)ref_normalize_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +bool RefNormalize::Prerun(Node* node) +{ + int layout = exec_attr->graph_layout; + Tensor* input_tensor = node->GetInputTensor(0); + 
if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefNormalize::Run(Node* node) +{ + + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + //Normalize* normalize_op = dynamic_cast(node->GetOp()); + //NormalizeParam* param_ = normalize_op->GetParam(); + + TShape& shape = input_tensor->GetShape(); + std::vector dims = shape.GetDim(); + const ExecAttr* exec_attr = any_cast(node->GetAttr(ATTR_EXEC_ATTR)); + + op_param.layout = exec_attr->graph_layout; + if(TENGINE_LAYOUT_NCHW == op_param.layout) + { + op_param.input_n = dims[0]; + op_param.input_h = dims[2]; + op_param.input_w = dims[3]; + op_param.input_c = dims[1]; + } + else // nhwc + { + op_param.input_n = dims[0]; + op_param.input_h = dims[1]; + op_param.input_w = dims[2]; + op_param.input_c = dims[3]; + } + + uint8_t *scale = NULL; + if(node->GetInputNum() > 1) + { + const Tensor* scale_tensor = node->GetInputTensor(1); + scale = (uint8_t* )get_tensor_mem(scale_tensor); + } + uint8_t* input = (uint8_t *)get_tensor_mem(input_tensor); + uint8_t* output = (uint8_t *)get_tensor_mem(output_tensor); + if(TENGINE_DT_UINT8 == input_tensor->GetDataType() || + TENGINE_DT_INT8 == input_tensor->GetDataType()) + { + auto *in_quant = input_tensor->GetQuantParam(); + if(in_quant->size()) + { + op_param.in_scale = (*in_quant)[0].scale; + op_param.in_zero = (*in_quant)[0].zero_point; + } + if(node->GetInputNum() == 2) + { + Tensor* scale_tensor = node->GetInputTensor(1); + auto *scale_quant = scale_tensor->GetQuantParam(); + if(scale_quant->size()) + { + op_param.scale_scale = (*scale_quant)[0].scale; + op_param.scale_zero = (*scale_quant)[0].zero_point; + } + } + } + if(TENGINE_DT_UINT8 == input_tensor->GetDataType()) + { + auto *out_quant = output_tensor->GetQuantParam(); + if(out_quant->size()) + { + op_param.out_scale = (*out_quant)[0].scale; + op_param.out_zero = 
(*out_quant)[0].zero_point; + } + } + int ret = kernel_run(input,output,scale,&(this->op_param)); + if(ret < 0) + return false; + + if(TENGINE_DT_INT8 == input_tensor->GetDataType()) + { + auto *out_quant = output_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + q_param.zero_point = 0; + out_quant->resize(0); + out_quant->push_back(q_param); + } + + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + RefNormalize* ops = new RefNormalize(); + + return ops; +} + +} // namespace RefNormalizeOps + +void RegisterRefNormlizeOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Normalize", RefNormalizeOps::SelectFunc, 2000); +} +} // namespace TEngine diff --git a/executor/operator/ref/ref_permute.cpp b/executor/operator/ref/ref_permute.cpp new file mode 100644 index 000000000..59838625e --- /dev/null +++ b/executor/operator/ref/ref_permute.cpp @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jjzeng@openailab.com + */ + +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "graph.hpp" +#include "operator/permute.hpp" + +#include "kernel/permute/permute_kernel.h" + +namespace TEngine +{ + namespace RefPermuteOps + { + const int default_prio = 1500; + struct RefPermute : public MTNodeOps + { + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Reshape(Node* node) override; + bool Postrun(Node* node) override; + bool GetSharedMemorySize(Node*, unsigned int& mem_size) override; + bool SetSharedMemoryAddr(Node*, void* mem_addr, int mem_size) override; + void InitRegistry(void); + + RefPermute() + { + kernel_run = nullptr; + InitRegistry(); + } + + permute_param op_param; + permute_t kernel_run; + KernelRegistry kernel_registry; + }; + + void RefPermute::InitRegistry(void) + { + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((permute_t)ref_permute_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((permute_t)ref_permute_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((permute_t)ref_permute_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((permute_t)ref_permute_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((permute_t)ref_permute_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((permute_t)ref_permute_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((permute_t)ref_permute_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((permute_t)ref_permute_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + + } + + bool RefPermute::Reshape(Node* node) + { + return true; + } + + bool 
RefPermute::SetSharedMemoryAddr(Node* node, void* mem_addr, int mem_size) + { + return true; + } + + bool RefPermute::GetSharedMemorySize(Node* node, unsigned int& mem_size) + { + return true; + } + + bool RefPermute::Prerun(Node* node) + { + int layout=exec_attr->graph_layout; + + Permute* permute_op = dynamic_cast(node->GetOp()); + PermuteParam* param = permute_op->GetParam(); + + op_param.order0 = param->order0; + op_param.order1 = param->order1; + op_param.order2 = param->order2; + op_param.order3 = param->order3; + + Tensor* in_tensor = node->GetInputTensor(0); + auto dims = in_tensor->GetShape().GetDim(); + for(std::size_t ii=0; iiGetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; + } + + bool RefPermute::Run(Node* node) + { + Tensor* o_tensor = node->GetOutputTensor(0); + void* output = get_tensor_mem(o_tensor); + Tensor* i_tensor = node->GetInputTensor(0); + const void* input = get_tensor_mem(i_tensor); + float scale = 1; + int data_type = i_tensor->GetDataType(); + auto* i_quant = i_tensor->GetQuantParam(); + if( (*i_quant).size() !=0 ) + { + scale = (*i_quant)[0].scale; + } + + int ret = kernel_run(input,output,&op_param); + if(ret<0) + return false; + + if( data_type == TENGINE_DT_INT8 ) + { + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + + return true; + } + + bool RefPermute::Postrun(Node* node) + { + return true; + } + + NodeOps* SelectFunc(const CPUInfo* info, Node* node) + { + RefPermute* ops = new RefPermute(); + + LOG_DEBUG()<<"Refpermute is selected\n"; + + return ops; + } + + + } //end namespace RefConcatOps + + void RegisterRefPermute(void) + { + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Permute", RefPermuteOps::SelectFunc,RefPermuteOps::default_prio); + } + +} diff --git a/executor/operator/ref/ref_pooling.cpp b/executor/operator/ref/ref_pooling.cpp new file mode 100644 index 
000000000..5c055e71a --- /dev/null +++ b/executor/operator/ref/ref_pooling.cpp @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/pooling.hpp" +#include "kernel/pooling/ref_pooling_kernel.h" + +namespace TEngine { + +namespace RefPoolingOps { + +struct RefPooling : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool Reshape(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + struct op_data param; + ref_pooling_kernel_t kernel_run; + KernelRegistry kernel_registry; + + RefPooling(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool RefPooling::Prerun(Node * node) +{ + int layout = exec_attr->graph_layout; + param.layout = layout; + Pooling* pooling_op = dynamic_cast(node->GetOp()); + PoolParam* param_ = pooling_op->GetParam(); + param.kernels[0] = param_->kernel_h; + param.kernels[1] = param_->kernel_w; + param.strides[0] = param_->stride_h; + 
param.strides[1] = param_->stride_w; + param.pads[0] = param_->pad_h0; + param.pads[1] = param_->pad_w0; + param.method = param_->alg; + param.caffe_flavor = param_->caffe_flavor; + + Tensor * input = node->GetInputTensor(0); + param.batch = input->GetShape().GetN(); + param.channel = input->GetShape().GetC(); + param.input[0] = input->GetShape().GetH(); + param.input[1] = input->GetShape().GetW(); + + Tensor * output= node->GetOutputTensor(0); + param.output[0] = output->GetShape().GetH(); + param.output[1] = output->GetShape().GetW(); + + if(input->GetDataType() == TENGINE_DT_UINT8) + { + auto quant_param = input->GetQuantParam(); + param.zero_point = (*quant_param)[0].zero_point; + } + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefPooling::Reshape(Node* node) +{ + Pooling* pooling_op = dynamic_cast(node->GetOp()); + PoolParam* param_ = pooling_op->GetParam(); + param.kernels[0] = param_->kernel_h; + param.kernels[1] = param_->kernel_w; + + Tensor * input = node->GetInputTensor(0); + param.batch = input->GetShape().GetN(); + param.channel = input->GetShape().GetC(); + param.input[0] = input->GetShape().GetH(); + param.input[1] = input->GetShape().GetW(); + + Tensor * output= node->GetOutputTensor(0); + param.output[0] = output->GetShape().GetH(); + param.output[1] = output->GetShape().GetW(); + return true; +} + +bool RefPooling::Run(Node * node) +{ + if(kernel_run == nullptr) + return false; + + Tensor* input = node->GetInputTensor(0); + Tensor* output = node->GetOutputTensor(0); + auto i_quant = input->GetQuantParam(); + auto o_quant = output->GetQuantParam(); + if(input->GetDataType() == TENGINE_DT_INT8) + { + if(i_quant->size() != 1) + { + std::cerr<<"Input data_type is INT8 ,and quant param num is not 1 !!!!\n"; + return false; + } + o_quant->resize(0); + o_quant->push_back((*i_quant)[0]); + } + + + const void* input_data = get_tensor_mem(input); + 
void* output_data = get_tensor_mem(output); + + if(kernel_run(input_data, output_data, ¶m)<0) + return false; + + + return true; +} + +void RefPooling::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefPooling* ops = new RefPooling(); + + LOG_DEBUG()<<"Demo RefPoolingOp is selected\n"; + + return ops; +} + +} // namespace RefPoolingOps + +void RegisterRefPoolingOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Pooling", RefPoolingOps::SelectFunc, 8000); +} + + +} // namespace TEngine diff --git a/executor/operator/ref/ref_rpn.cpp b/executor/operator/ref/ref_rpn.cpp new file mode 100644 index 000000000..b1a6b3aec --- /dev/null +++ b/executor/operator/ref/ref_rpn.cpp @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#include +#include +#include +#include +#include + +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/rpn.hpp" +#include "kernel/rpn/ref_rpn_kernel.h" + +void ref_proposal_local_anchor(int feat_height, int feat_width, int feat_stride, std::vector& anchors, + float* local_anchors) +{ + int feat_size = feat_height*feat_width; + int num_anchors = ( int )anchors.size(); + for(int i = 0; i < num_anchors; ++i) + { + for(int j = 0; j < feat_height; j++) + for(int k = 0; k < feat_width; k++) + { + local_anchors[(i * 4 + 0) * feat_size + j * feat_width + k] = anchors[i].x0 + k*feat_stride; + local_anchors[(i * 4 + 1) * feat_size + j * feat_width + k] = anchors[i].y0 + j*feat_stride; + local_anchors[(i * 4 + 2) * feat_size + j * feat_width + k] = anchors[i].x1 + k*feat_stride; + local_anchors[(i * 4 + 3) * feat_size + j * feat_width + k] = anchors[i].y1 + j*feat_stride; + } + } +} + +namespace TEngine { + +namespace RefRPNImpl { + +struct RefRPNOps : public NodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + struct rpn_param param; + ref_rpn_kernel_t kernel_run; + KernelRegistry kernel_registry; + + RefRPNOps(void) + { + 
kernel_run=nullptr; + + InitRegistry(); + } + +}; +void RefRPNOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_rpn_kernel_t)ref_rpn_fp32, TENGINE_LAYOUT_NCHW, TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_rpn_kernel_t)ref_rpn_fp16, TENGINE_LAYOUT_NCHW, TENGINE_DT_FP16); +#endif +/* + +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_pooling_kernel_t)ref_pooling_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif +*/ +} + +bool RefRPNOps::Prerun(Node* node) +{ + RPN* RPN_op = dynamic_cast(node->GetOp()); + RPNParam* param_ = RPN_op->GetParam(); + param.feat_stride = param_->feat_stride; + param.min_size = param_->min_size; + param.per_nms_topn = param_->per_nms_topn; + param.post_nms_topn = param_->post_nms_topn; + param.nms_thresh = param_->nms_thresh; + + int layout = exec_attr->graph_layout; + Tensor * input = node->GetInputTensor(0); + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool RefRPNOps::Run(Node* node) +{ + printf("ref RPN run\n"); + RPN* RPN_op = dynamic_cast(node->GetOp()); + RPNParam* param_ = RPN_op->GetParam(); + + const Tensor* score_tensor = node->GetInputTensor(0); + const Tensor* featmap_tensor = node->GetInputTensor(1); + const Tensor* info_tensor = node->GetInputTensor(2); + Tensor* output_tensor = node->GetOutputTensor(0); + TShape& out_shape = output_tensor->GetShape(); + + const void* score_org = get_tensor_mem(score_tensor); + void* featmap_org = get_tensor_mem(featmap_tensor); + const float* info_org = 
(float*)get_tensor_mem(info_tensor); + void* output_org = get_tensor_mem(output_tensor); + + const TShape& featmap_shape = featmap_tensor->GetShape(); + const int feat_channel = featmap_shape.GetC(); + const int feat_height = featmap_shape.GetH(); + const int feat_width = featmap_shape.GetW(); + const int feat_size = feat_height * feat_width; + + const TShape& score_shape = score_tensor->GetShape(); + param.num_anchors = ( int )param_->anchors_.size(); + param.feat_chan = feat_channel; + param.feat_height = feat_height; + param.feat_width = feat_width; + param.score_chan = score_shape.GetC(); + param.src_height = info_org[0]; + param.src_width = info_org[1]; + param.src_scale = info_org[2]; + + + // local_anchors (1, anchors_nums_ * 4, map_height_, map_width_); + int size = param.num_anchors * 4 * feat_size; + float* local_anchors = new float[size]; + + ref_proposal_local_anchor(feat_height, feat_width, param.feat_stride, param_->anchors_, local_anchors); + + int output_num = kernel_run(score_org, featmap_org, local_anchors, output_org, ¶m); + + std::vector outdim = {1, output_num, 4, 1}; + out_shape.SetDim(outdim); + + delete[] local_anchors; + + return true; +} + +NodeOps* SelectFunc(const CPUInfo* cpu_info, Node* node) +{ + + RefRPNOps* ops = new RefRPNOps(); + + return ops; +} + +} // namespace RefRPNImpl + +void RegisterRefRPNOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor("reference", "RPN", RefRPNImpl::SelectFunc, 1000); + +} + +} // namespace TEngine diff --git a/executor/operator/ref/ref_slice.cpp b/executor/operator/ref/ref_slice.cpp new file mode 100644 index 000000000..579fed5be --- /dev/null +++ b/executor/operator/ref/ref_slice.cpp @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: ruizhang@openailab.com + */ + +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "graph.hpp" +#include "operator/slice.hpp" + +#include "kernel/slice/slice_kernel.h" + +namespace TEngine +{ + namespace RefSliceOps + { + const int default_prio = 1500; + struct RefSlice : public MTNodeOps + { + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + void InitRegistry(void); + + RefSlice() + { + kernel_run = nullptr; + InitRegistry(); + } + struct slice_param op_param; + slice_t kernel_run; + int8_t** out_data_ptrs; + KernelRegistry kernel_registry; + }; + + void RefSlice::InitRegistry(void) + { + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((slice_t)ref_slice_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((slice_t)ref_slice_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((slice_t)ref_slice_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((slice_t)ref_slice_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((slice_t)ref_slice_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + 
kernel_registry.Register((slice_t)ref_slice_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((slice_t)ref_slice_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((slice_t)ref_slice_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + + } + + bool RefSlice::Prerun(Node* node) + { + int layout=exec_attr->graph_layout; + Slice* slice_op = dynamic_cast(node->GetOp()); + SliceParam * param = slice_op->GetParam(); + Tensor* input_tensor = node->GetInputTensor(0); + int data_type = input_tensor->GetDataType(); + auto in_dim = input_tensor->GetShape().GetDim(); + unsigned int out_num = node->GetOutputNum(); + out_data_ptrs = new int8_t*[out_num]; + op_param.axis = param->axis; + op_param.output_shape = new shape_dim[out_num]; + op_param.out_num = out_num; + op_param.dim_num = (int)(in_dim.size()); + op_param.iscaffe = param->iscaffe; + if(!kernel_registry.GetKernel(kernel_run,layout,data_type)) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; + } + + bool RefSlice::Run(Node* node) + { + Slice* slice_op = dynamic_cast(node->GetOp()); + SliceParam * param = slice_op->GetParam(); + Tensor * input_tensor = node->GetInputTensor(0); + int8_t *input = (int8_t*)get_tensor_mem(input_tensor); + auto in_dim = input_tensor->GetShape().GetDim(); + auto *in_quant = input_tensor->GetQuantParam(); + if(in_quant->size() > 0) + { + op_param.out_scale = (*in_quant)[0].scale; + } + const int data_type = input_tensor->GetDataType(); + if(op_param.iscaffe) + { + //set the input dim and output dim + for(int i = 0; i < op_param.dim_num;i++) + { + op_param.in_shape[i] = in_dim[i]; + } + // set the output + for(int i = 0; i < op_param.out_num; ++i) + { + Tensor * out_tensor = node->GetOutputTensor(i); + auto out_dim = out_tensor->GetShape().GetDim(); + for(int j = 0; j < op_param.dim_num; ++j) + { + op_param.output_shape[i].dims[j] = out_dim[j]; + } + out_data_ptrs[i] = 
(int8_t*)get_tensor_mem(out_tensor); + //set the output quant param + if( data_type == TENGINE_DT_INT8 ) + { + auto* o_quant = out_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + } + } + else // For tensorflow, there is only one output tensor + { + int maxdim = 4; + int real_dim = op_param.dim_num; + int dim_idx = 0; + for(int idx = 0; idx < maxdim; idx++) + { + if(maxdim - idx > real_dim) + { + op_param.output_shape[0].begins[idx] = 0; + op_param.output_shape[0].sizes[idx] = 1; + op_param.in_shape[idx] = 1; + } + else + { + op_param.output_shape[0].begins[idx] = param->begin_[dim_idx]; + op_param.output_shape[0].sizes[idx] = param->size_[dim_idx]; + op_param.in_shape[idx] = in_dim[dim_idx]; + dim_idx++; + } + } + Tensor* o_tensor = node->GetOutputTensor(0); + out_data_ptrs[0] = (int8_t*)get_tensor_mem(o_tensor); + // Set the int8 output quant param + if( data_type == TENGINE_DT_INT8 ) + { + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + + } + int ret = kernel_run(input,out_data_ptrs,&op_param); + if(ret<0) + return false; + return true; + } + + bool RefSlice::Postrun(Node* node) + { + delete[] out_data_ptrs; + delete[] op_param.output_shape; + return true; + } + + NodeOps* SelectFunc(const CPUInfo* info, Node* node) + { + RefSlice* ops = new RefSlice(); + + LOG_DEBUG()<<"RefSlice is selected\n"; + + return ops; + } + + + } //end namespace RefSliceOps + + void RegisterRefSlice(void) + { + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Slice", + RefSliceOps::SelectFunc,RefSliceOps::default_prio); + } + +} diff --git a/executor/operator/ref/ref_softmax.cpp b/executor/operator/ref/ref_softmax.cpp new file mode 100644 index 000000000..3bd6cca28 --- /dev/null +++ b/executor/operator/ref/ref_softmax.cpp @@ -0,0 +1,211 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haitao@openailab.com + */ + +#include +#include +#include "kernel/softmax/ref_softmax.h" + +#include "data_type.hpp" +#include "operator/softmax.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +namespace TEngine { + +namespace RefSoftmaxOps { + +/* impl ref softmax op */ +// +inline static int get_scale_zero(Tensor* itensor, Tensor* otensor, op_data* param) +{ + auto* i_quant = itensor->GetQuantParam(); + auto* o_quant = otensor->GetQuantParam(); + if( i_quant->size() != 1) + { + std::cerr<<"quant size: input("<< i_quant->size()<<")\n"; + return -1; + } + param->i_scale = (*i_quant)[0].scale; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + if( o_quant->size() != 1) + { + std::cerr<<"output quant size: "<size()<<"\n"; + return -1; + } + + param->o_scale = (*o_quant)[0].scale; + param->o_zero = (*o_quant)[0].zero_point; + + param->i_zero = (*i_quant)[0].zero_point; + } + return 0; +} +// +struct RefSoftmax : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + void 
InitRegistry(void); + + float * max_array; + float * sum_array; + + op_data op_param; + + ref_softmax_kernel_t kernel_run; + + KernelRegistry kernel_registry; + + RefSoftmax(void) + { + max_array=nullptr; + sum_array=nullptr; + + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool RefSoftmax::Prerun(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + int layout = exec_attr->graph_layout; + + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + return true; +} + +bool RefSoftmax::Run(Node * node) +{ + Tensor * input_tensor=node->GetInputTensor(0); + Tensor * output_tensor=node->GetOutputTensor(0); + + const std::vector& dims = input_tensor->GetShape().GetDim(); + // + Softmax* softmax_op = dynamic_cast(node->GetOp()); + SoftmaxParam* param_ = softmax_op->GetParam(); + int axis = param_->axis; + int out_size = 1; + for(int i = 0; i < axis; i++) + { + out_size *= dims[i]; + } + int in_size = 1; + for(size_t i = axis + 1; i < dims.size(); i++) + { + in_size *= dims[i]; + } + int on_size = dims[axis]; + + max_array = ( float* )std::malloc(in_size * sizeof(float)); + sum_array = ( float* )std::malloc(in_size * sizeof(float)); + + // + op_param.out_size=out_size; + op_param.in_size=in_size; + op_param.on_size=on_size; + + // + void* input=(void*)get_tensor_mem(input_tensor); + void* output=(void*)get_tensor_mem(output_tensor); + // + /* Get input,kernel,output scale & zero */ + /* Current: one tensor has only one quantparam(scale)*/ + if(input_tensor->GetDataType() == TENGINE_DT_INT8 || + input_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero(input_tensor, output_tensor, &op_param) < 0) + return false; + } + // + int ret = kernel_run(input,output,max_array,sum_array,&op_param); + // + if(input_tensor->GetDataType() == TENGINE_DT_INT8) + { + auto* o_quant = output_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.o_scale; + 
o_quant->resize(0); + o_quant->push_back(q_param); + } + + std::free(max_array); + std::free(sum_array); + + if(ret<0) + return false; + else + return true; +} + +void RefSoftmax::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_softmax_kernel_t)ref_softmax_kernel_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefSoftmax* ops = new RefSoftmax(); + + LOG_DEBUG()<<"RefSoftmaxOp is selected\n"; + + return ops; +} + +} // namespace RefSoftmaxOps + +void RegisterRefSoftmaxOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Softmax", RefSoftmaxOps::SelectFunc, 1000); +} + +} // namespace TEngine diff --git a/executor/operator/ref/ref_swap_axis.cpp b/executor/operator/ref/ref_swap_axis.cpp new file mode 100644 index 000000000..7afba1f6b --- /dev/null +++ b/executor/operator/ref/ref_swap_axis.cpp @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ + +#include + + + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/swap_axis.hpp" + +#include "kernel/swap_axis/ref_swap_axis_kernel.h" + +namespace TEngine { + +namespace RefSwapAxisOps { + + + +struct RefSwapAxis : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + int dims[5]; + ref_swap_axis_kernel_t kernel_run; + KernelRegistry kernel_registry; + RefSwapAxis(void) + { + + kernel_run=nullptr; + + InitRegistry(); + } +}; + +void RefSwapAxis::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif +/* +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef 
CONFIG_KERNEL_INT8 + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axisl_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((ref_swap_axis_kernel_t)ref_swap_axis_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif +*/ +} + +bool RefSwapAxis::Prerun(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + int layout = exec_attr->graph_layout; + if(!kernel_registry.GetKernel(kernel_run,layout,input_tensor->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + std::vector& in_dims = input_tensor->GetShape().GetDim(); + int in_dims_size = in_dims.size(); + SwapAxis* swap = dynamic_cast(node->GetOp()); + SwapAxisParam* param_ = swap->GetParam(); + int dim0 = param_->dim_0; + int dim1 = param_->dim_1; + if(dim0 > dim1) + { + int tmp = dim0; + dim0 = dim1; + dim1 = tmp; + } + + for(int i = 0; i < 5; i++) + dims[i] = 1; + //dim0 + for(int i = 0; i < dim0; i++) + dims[0] *= in_dims[i]; + //dim1 + dims[1] = in_dims[dim0]; + //dim2 + for(int i = dim0+1; i < dim1; i++ ) + dims[2] *= in_dims[i]; + //dim3 + dims[3] = in_dims[dim1]; + //dim4 + for(int i = dim1+1; i < in_dims_size; i++ ) + dims[4] *= in_dims[i]; + + + return true; +} + +bool RefSwapAxis::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + + void* input_org = get_tensor_mem(input_tensor); + void* output_org = get_tensor_mem(output_tensor); + + kernel_run(input_org, output_org, dims); + + return true; +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefSwapAxis* ops = new RefSwapAxis(); + + LOG_DEBUG()<<"RefSwapAxis is selected\n"; + + return ops; +} + + +} // namespace RefSwapAxisOps + +void RegisterSwapAxisOps(void) 
+{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "SwapAxis", RefSwapAxisOps::SelectFunc, 1000); +} + +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/relu.cpp b/executor/operator/ref/relu.cpp new file mode 100644 index 000000000..b94661946 --- /dev/null +++ b/executor/operator/ref/relu.cpp @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/relu.hpp" + +#include "kernel/relu/relu.h" + +namespace TEngine { + +namespace RefReluOps { + + + +struct ReluOps : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + relu_t kernel_run; + + KernelRegistry kernel_registry; + + ReluOps(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool ReluOps::Prerun(Node * node) +{ + Tensor * input=node->GetInputTensor(0); + int layout=exec_attr->graph_layout; + + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + //printf("errorno: %d\n",ENOENT); + return false; + } + + + return true; +} + +bool ReluOps::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool ReluOps::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + const TShape& shape = output_tensor->GetShape(); + int elem_num = shape.GetSize(); + + ReLu* relu_op = dynamic_cast(node->GetOp()); + ReLuParam* param = relu_op->GetParam(); + void* data = get_tensor_mem(output_tensor); + float negativeslope=param->negative_slope; + + float scale = 1.f; + int zero_point = 0; + if(input_tensor->GetDataType() == TENGINE_DT_INT8 || input_tensor->GetDataType() == TENGINE_DT_UINT8) + { + auto quant_param = input_tensor->GetQuantParam(); + scale = (*quant_param)[0].scale; + zero_point = (*quant_param)[0].zero_point; + auto out_quant_param = output_tensor->GetQuantParam(); + out_quant_param->resize(0); + out_quant_param->push_back((*quant_param)[0]); + } + + int 
ret=kernel_run(data,elem_num,negativeslope, scale, zero_point); + + if(ret<0) + return false; + else + return true; +} + +void ReluOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((relu_t)relu_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((relu_t)relu_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((relu_t)relu_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((relu_t)relu_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((relu_t)relu_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((relu_t)relu_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((relu_t)relu_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((relu_t)relu_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + ReluOps* ops = new ReluOps(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefReluOps +void RegisterReluOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "ReLu", RefReluOps::SelectFunc, 1000); +} +} // namespace TEngine diff --git a/executor/operator/ref/relu6.cpp b/executor/operator/ref/relu6.cpp new file mode 100644 index 000000000..5306f3070 --- /dev/null +++ b/executor/operator/ref/relu6.cpp @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "kernel/relu6/relu6.h" + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +namespace TEngine { + +namespace RefRelu6Ops { + + + +struct Relu6Ops : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + relu6_t kernel_run; + + KernelRegistry kernel_registry; + + Relu6Ops(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool Relu6Ops::Prerun(Node * node) +{ + Tensor * input=node->GetInputTensor(0); + int layout=exec_attr->graph_layout; + + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + //int elem_size=DataType::GetTypeSize(input->GetDataType()); + + return true; +} + +bool Relu6Ops::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool Relu6Ops::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + //int element_size = DataType::GetTypeSize(input_tensor->GetDataType()); + const TShape& shape = input_tensor->GetShape(); + int elem_num = shape.GetSize(); + void* data = get_tensor_mem(output_tensor); + + float scale = 1.f; + int zero_point = 0; + 
if(input_tensor->GetDataType() == TENGINE_DT_INT8 || input_tensor->GetDataType() == TENGINE_DT_UINT8) + { + auto quant_param = input_tensor->GetQuantParam(); + scale = (*quant_param)[0].scale; + zero_point = (*quant_param)[0].zero_point; + auto out_quant_param = output_tensor->GetQuantParam(); + out_quant_param->resize(0); + out_quant_param->push_back((*quant_param)[0]); + } + + int ret=kernel_run(data,elem_num, scale, zero_point); + + if(ret<0) + return false; + else + return true; +} + +void Relu6Ops::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((relu6_t)relu6_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((relu6_t)relu6_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((relu6_t)relu6_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((relu6_t)relu6_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((relu6_t)relu6_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((relu6_t)relu6_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((relu6_t)relu6_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((relu6_t)relu6_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + Relu6Ops* ops = new Relu6Ops(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefRelu6Ops +void RegisterRelu6Ops(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "ReLu6", RefRelu6Ops::SelectFunc, 1000); +} + +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/reshape.cpp b/executor/operator/ref/reshape.cpp new file mode 100644 index 000000000..39a2f7669 --- /dev/null +++ b/executor/operator/ref/reshape.cpp @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/reshape.hpp" + +namespace TEngine { + +namespace RefReshapeOps { + + + +struct RefReshape : public MTNodeOps +{ + bool OnBind(Node * node) override; + bool Run(Node * node) override; + +}; + + +bool RefReshape::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool RefReshape::Run(Node * node) +{ + + return true; +} + + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefReshape* ops = new RefReshape(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefReluOps +void RegisterReshapeOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Reshape", RefReshapeOps::SelectFunc, 1000); +} +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/resize.cpp b/executor/operator/ref/resize.cpp new file mode 100644 index 000000000..adf601da6 
--- /dev/null +++ b/executor/operator/ref/resize.cpp @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +#include "operator/resize.hpp" +#include "kernel/resize/resize_kernel.h" + +namespace TEngine { + +namespace RefResizeOps { + + +struct ResizeOps : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool Run(Node * node) override; + bool Postrun(Node * node) override; + void InitRegistry(void); + + struct resize_param op_param; + resize_t kernel_run; + + + KernelRegistry kernel_registry; + + ResizeOps(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool ResizeOps::Prerun(Node * node) +{ + Tensor * input=node->GetInputTensor(0); + int layout=exec_attr->graph_layout; + Resize* resize_op = dynamic_cast(node->GetOp()); + ResizeParam* param_ = resize_op->GetParam(); + op_param.scale_x = 1.f / param_->scale_w; + op_param.scale_x = 1.f / param_->scale_h; + op_param.type = param_->type; + + 
if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool ResizeOps::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + const TShape& shape = input_tensor->GetShape(); + + op_param.batch = shape.GetN(); + op_param.channel = shape.GetC(); + op_param.input_h = shape.GetH(); + op_param.input_w = shape.GetW(); + + const TShape& shape1 = output_tensor->GetShape(); + op_param.output_h = shape1.GetH(); + op_param.output_w = shape1.GetW(); + + + float* input = ( float* )get_tensor_mem(input_tensor); + float* output = ( float* )get_tensor_mem(output_tensor); + int ret=-1; + + ret=kernel_run(input, output, &op_param); + + if(ret<0) + return false; + else + return true; + +} + +bool ResizeOps::Postrun(Node * node) +{ + return true; +} + +void ResizeOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((resize_t)resize_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((resize_t)resize_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); +#endif +// #ifdef CONFIG_KERNEL_INT8 +// kernel_registry.Register((resize_t)resize_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); +// kernel_registry.Register((resize_t)resize_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +// #endif + +// #ifdef CONFIG_KERNEL_UINT8 +// kernel_registry.Register((resize_t)resize_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); +// kernel_registry.Register((resize_t)resize_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +// #endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + ResizeOps* ops = new ResizeOps(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefReluOps +void RegisterResizeOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Resize", RefResizeOps::SelectFunc, 1000); +} +} // namespace 
TEngine diff --git a/executor/operator/ref/sigmoid.cpp b/executor/operator/ref/sigmoid.cpp new file mode 100644 index 000000000..dc617610a --- /dev/null +++ b/executor/operator/ref/sigmoid.cpp @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "kernel/sigmoid/sigmoid.h" + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +namespace TEngine { + +namespace RefSigmoidOps { + + + +struct SigmoidOps : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + bool Postrun(Node * node) override; + void InitRegistry(void); + + sigmoid_param op_param; + sigmoid_t kernel_run; + + KernelRegistry kernel_registry; + + SigmoidOps(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +static int get_scale_zero(Tensor* itensor,Tensor * otensor,sigmoid_param* param) +{ + auto* i_quant = itensor->GetQuantParam(); + auto* o_quant = otensor->GetQuantParam(); + if( i_quant->size() != 1 ) + { + return -1; + } + param->scale[0] = (*i_quant)[0].scale; + param->zero[0] = (*i_quant)[0].zero_point; + if(itensor->GetDataType() == TENGINE_DT_UINT8) + { + if( o_quant->size() != 1) + { + return -1; + } + + param->scale[1] = (*o_quant)[0].scale; + param->zero[1] = (*o_quant)[0].zero_point; + + } + return 0; +} + +bool SigmoidOps::Prerun(Node * node) +{ + Tensor * input=node->GetInputTensor(0); + int layout=exec_attr->graph_layout; + + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + return true; +} + +bool SigmoidOps::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool SigmoidOps::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + Tensor* output_tensor = node->GetOutputTensor(0); + const TShape& shape = input_tensor->GetShape(); + int elem_num = shape.GetSize(); + void* data = get_tensor_mem(input_tensor); + if(input_tensor->GetDataType() == 
TENGINE_DT_INT8 || + input_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + if(get_scale_zero(input_tensor, output_tensor, &op_param) < 0) + return false; + } + + int ret = kernel_run(data, elem_num, &op_param); + + if(input_tensor->GetDataType() == TENGINE_DT_INT8) + { + auto* o_quant = output_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = 1/127; + q_param.zero_point = 0; + o_quant->resize(0); + o_quant->push_back(q_param); + } + + + if(ret<0) + return false; + else + return true; +} + +bool SigmoidOps::Postrun(Node * node) +{ + return true; +} + +void SigmoidOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((sigmoid_t)sigmoid_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((sigmoid_t)sigmoid_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((sigmoid_t)sigmoid_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((sigmoid_t)sigmoid_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((sigmoid_t)sigmoid_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((sigmoid_t)sigmoid_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((sigmoid_t)sigmoid_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((sigmoid_t)sigmoid_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + SigmoidOps* ops = new SigmoidOps(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefsigmoidOps +void RegisterSigmoidOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Sigmoid", RefSigmoidOps::SelectFunc, 1000); +} + +} // namespace TEngine diff --git a/executor/operator/ref/split.cpp b/executor/operator/ref/split.cpp new file mode 100644 index 000000000..de9582b58 --- /dev/null 
+++ b/executor/operator/ref/split.cpp @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include +#include + +#include "logger.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "graph.hpp" +#include "operator/split.hpp" + +#include "kernel/split/split_kernel.h" + +namespace TEngine +{ + namespace RefSplitOps + { + const int default_prio = 1500; + struct RefSplit : public MTNodeOps + { + bool Prerun(Node* node) override; + bool Run(Node* node) override; + bool Postrun(Node* node) override; + void InitRegistry(void); + + RefSplit() + { + kernel_run = nullptr; + InitRegistry(); + } + + struct split_param op_param; + split_t kernel_run; + void** output_data; + KernelRegistry kernel_registry; + }; + + void RefSplit::InitRegistry(void) + { + #ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((split_t)split_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((split_t)split_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); + #endif + #ifdef CONFIG_KERNEL_FP16 + 
kernel_registry.Register((split_t)split_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((split_t)split_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); + #endif + #ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((split_t)split_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((split_t)split_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); + #endif + #ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((split_t)split_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((split_t)split_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); + #endif + + } + + bool RefSplit::Prerun(Node* node) + { + int layout=exec_attr->graph_layout; + Tensor* output_tensor = node->GetOutputTensor(0); + Split* split_op = dynamic_cast(node->GetOp()); + SplitParam* param = split_op->GetParam(); + + Tensor* input_tensor = node->GetInputTensor(0); + int data_type = input_tensor->GetDataType(); + op_param.axis = param->axis; + + int out_nums = node->GetOutputNum(); + output_data = new void*[out_nums]; + + op_param.output_shape = new shape_dim[out_nums]; + op_param.output_counts = out_nums; + + + + auto dims = output_tensor->GetShape().GetDim(); + op_param.output_dim = (int)(dims.size()); + for(int i=0;iGetInputTensor(0); + void* input = get_tensor_mem(i_tensor); + int data_type = -1; + + data_type = i_tensor->GetDataType(); + auto* in_quant = i_tensor->GetQuantParam(); + if( (*in_quant).size() != 0 ) + { + op_param.input_shape.scale = (*in_quant)[0].scale; + op_param.out_scale = (*in_quant)[0].scale; + op_param.input_shape.zero = (*in_quant)[0].zero_point; + } + else + { + op_param.input_shape.scale = 1; + op_param.input_shape.zero = 0; + } + + auto dims = i_tensor->GetShape().GetDim(); + op_param.input_dim = (int)(dims.size()); + + for(std::size_t jj=0; jjGetOutputTensor(ii); + auto* o_quant = o_tensor->GetQuantParam(); + if( (*o_quant).size() !=0) + { + op_param.output_shape[ii].scale = (*o_quant)[0].scale; + op_param.output_shape[ii].zero = 
(*o_quant)[0].zero_point; + } + else + { + op_param.output_shape[ii].scale = 1; + op_param.output_shape[ii].zero = 0; + } + output_data[ii] = get_tensor_mem(o_tensor); + + } + + int ret = kernel_run(input, output_data, &op_param); + if(ret<0) + return false; + + if( data_type == TENGINE_DT_INT8 ) + { + + for(int ii=0; iiGetOutputTensor(ii); + auto* o_quant = o_tensor->GetQuantParam(); + QuantParam q_param; + q_param.scale = op_param.out_scale; + o_quant->resize(0); + o_quant->push_back(q_param); + } + } + + return true; + + } + + bool RefSplit::Postrun(Node* node) + { + delete[] output_data; + delete[] op_param.output_shape; + return true; + } + + NodeOps* SelectFunc(const CPUInfo* info, Node* node) + { + RefSplit* ops = new RefSplit(); + + LOG_DEBUG()<<"Refconcat is selected\n"; + + return ops; + } + + + } //end namespace RefSplitOps + + void RegisterSplitOps(void) + { + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Split", RefSplitOps::SelectFunc,RefSplitOps::default_prio); + } + +} diff --git a/executor/operator/ref/squeeze.cpp b/executor/operator/ref/squeeze.cpp new file mode 100644 index 000000000..db0a2ef2f --- /dev/null +++ b/executor/operator/ref/squeeze.cpp @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + + + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" +#include "operator/squeeze.hpp" + +namespace TEngine { + +namespace RefSqueezeOps { + + + +struct RefSqueeze : public MTNodeOps +{ + bool OnBind(Node * node) override; + bool Run(Node * node) override; + + +}; + + +bool RefSqueeze::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool RefSqueeze::Run(Node * node) +{ + + return true; +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + RefSqueeze* ops = new RefSqueeze(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace RefSqueezeOps +void RegisterSqueezeOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Squeeze", RefSqueezeOps::SelectFunc, 1000); +} +} // namespace TEngine \ No newline at end of file diff --git a/executor/operator/ref/tanh.cpp b/executor/operator/ref/tanh.cpp new file mode 100644 index 000000000..858ab20b5 --- /dev/null +++ b/executor/operator/ref/tanh.cpp @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ + +#include + +#include "kernel/tanh/tanh.h" + +#include "data_type.hpp" +#include "kernel_registry.hpp" +#include "tengine_errno.hpp" +#include "logger.hpp" +#include "graph.hpp" +#include "node_ops.hpp" +#include "tensor_mem.hpp" + +namespace TEngine { + +namespace RefTanhOps { + + + +struct TanhOps : public MTNodeOps +{ + bool Prerun(Node * node) override; + bool OnBind(Node * node) override; + bool Run(Node * node) override; + void InitRegistry(void); + + struct tanh_param op_param; + tanh_t kernel_run; + + KernelRegistry kernel_registry; + + TanhOps(void) + { + kernel_run=nullptr; + + InitRegistry(); + } +}; + +bool TanhOps::Prerun(Node * node) +{ + Tensor * input=node->GetInputTensor(0); + Tensor * output=node->GetOutputTensor(0); + int layout=exec_attr->graph_layout; + + if(output->GetDataType() == TENGINE_DT_UINT8 ) + { + auto output_quant = output->GetQuantParam(); + if(output_quant->size() < 1) + return false; + op_param.output_scale = (*output_quant)[0].scale; + op_param.output_zero = (*output_quant)[0].zero_point; + } + + if(!kernel_registry.GetKernel(kernel_run,layout,input->GetDataType())) + { + set_tengine_errno(ENOENT); + return false; + } + + + return true; +} + +bool TanhOps::OnBind(Node * node) +{ + inplace_t io_map; + + io_map[0] = 0; + + node->SetAttr(ATTR_INPLACE, io_map); + return true; +} + +bool TanhOps::Run(Node * node) +{ + Tensor* input_tensor = node->GetInputTensor(0); + const TShape& shape = input_tensor->GetShape(); + int elem_num = 
shape.GetSize(); + void* data = get_tensor_mem(input_tensor); + + if(input_tensor->GetDataType() == TENGINE_DT_INT8 || + input_tensor->GetDataType() == TENGINE_DT_UINT8 ) + { + auto input_quant = input_tensor->GetQuantParam(); + if(input_quant->size() < 1) + return false; + op_param.input_scale = (*input_quant)[0].scale; + op_param.input_zero = (*input_quant)[0].zero_point; + } + + int ret=kernel_run(data, elem_num, &op_param); + + if(ret<0) + return false; + else + return true; +} + +void TanhOps::InitRegistry(void) +{ +#ifdef CONFIG_KERNEL_FP32 + kernel_registry.Register((tanh_t)tanh_fp32,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP32); + kernel_registry.Register((tanh_t)tanh_fp32,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP32); +#endif + +#ifdef CONFIG_KERNEL_FP16 + kernel_registry.Register((tanh_t)tanh_fp16,TENGINE_LAYOUT_NCHW,TENGINE_DT_FP16); + kernel_registry.Register((tanh_t)tanh_fp16,TENGINE_LAYOUT_NHWC,TENGINE_DT_FP16); +#endif +#ifdef CONFIG_KERNEL_INT8 + kernel_registry.Register((tanh_t)tanh_int8,TENGINE_LAYOUT_NCHW,TENGINE_DT_INT8); + kernel_registry.Register((tanh_t)tanh_int8,TENGINE_LAYOUT_NHWC,TENGINE_DT_INT8); +#endif + +#ifdef CONFIG_KERNEL_UINT8 + kernel_registry.Register((tanh_t)tanh_uint8,TENGINE_LAYOUT_NCHW,TENGINE_DT_UINT8); + kernel_registry.Register((tanh_t)tanh_uint8,TENGINE_LAYOUT_NHWC,TENGINE_DT_UINT8); +#endif + +} + +NodeOps* SelectFunc(const CPUInfo* info, Node* node) +{ + TanhOps* ops = new TanhOps(); + + LOG_DEBUG()<<"ReluOps RefOp is selected\n"; + + return ops; +} + + + + +} // namespace ReftanhOps +void RegisterTanhOps(void) +{ + NodeOpsRegistryManager::RegisterOPImplementor(REF_REGISTRY_NAME, "Tanh", RefTanhOps::SelectFunc, 1000); +} + +} // namespace TEngine diff --git a/executor/plugin/init.cpp b/executor/plugin/init.cpp index 28d84d651..f21986061 100644 --- a/executor/plugin/init.cpp +++ b/executor/plugin/init.cpp @@ -34,11 +34,6 @@ using namespace TEngine; namespace TEngine { extern void NodeOpsRegistryManagerInit(void); -extern void 
RegisterCommonOps(void); - -#if CONFIG_ARCH_ARM64 == 1 || CONFIG_ARCH_ARM32 == 1 -extern void RegisterArmOps(void); -#endif void DevAllocatorManagerInit(void); void DevSchedulerManagerInit(void); @@ -48,12 +43,6 @@ int executor_plugin_init(void) { NodeOpsRegistryManagerInit(); - RegisterCommonOps(); - -#if CONFIG_ARCH_ARM64 || CONFIG_ARCH_ARM32 - RegisterArmOps(); -#endif - DevAllocatorManagerInit(); DevSchedulerManagerInit(); diff --git a/hclarm/Makefile b/hclarm/Makefile new file mode 100644 index 000000000..ded2ce2e2 --- /dev/null +++ b/hclarm/Makefile @@ -0,0 +1,99 @@ +BUILD_DIR?=$(shell pwd)/build +INSTALL_DIR?=$(shell pwd)/install +MAKEBUILD?=$(shell pwd)/makefile.build + +export CC CXX CFLAGS LD LDFLAGS CXXFLAGS COMMON_CFLAGS + +default: $(LIB_HCL_SO) + +include $(MAKEFILE_CONFIG) + +INC_DIR+=-I$(shell pwd)/../include +INC_DIR+=-I$(shell pwd)/../core/include +INC_DIR+=-I$(shell pwd)/../operator/include +INC_DIR+=-I$(shell pwd)/../executor/include + +CXXFLAGS+= + + +COMMON_CFLAGS+=$(CONFIG_OPT_CFLAGS) +COMMON_CFLAGS+= -Wall -g -fPIC $(INC_DIR) -Werror + +ifeq ($(CONFIG_INTERN_RELEASE),y) + COMMON_CFLAGS+=-DCONFIG_INTERN_RELEASE +endif + +ifeq ($(CONFIG_INTERN_TRIAL),y) + COMMON_CFLAGS+=-DCONFIG_INTERN_TRIAL +endif + +ifneq ($(CONFIG_OPT_CFLAGS),) + COMMON_CFLAGS+=-O3 -funroll-loops +endif + +ARM_BLOB=$(BUILD_DIR)/arm-builtin.o +LIB_HCL_SO?=$(BUILD_DIR)/../libhclcpu.so + + +LIB_SUB_DIRS+=../executor/operator lib/ + +ifeq ($(CONFIG_ARCH_ARM64),y) + COMMON_CFLAGS+= -DCONFIG_ARCH_ARM64=1 +endif + +ifeq ($(CONFIG_ARCH_BLAS),y) + COMMON_CFLAGS+=-DCONFIG_ARCH_BLAS=1 +endif + +ifeq ($(CONFIG_ARCH_ARM32),y) + COMMON_CFLAGS+= -DCONFIG_ARCH_ARM32=1 + CC+= -march=armv7-a -mfpu=neon + CXX+=-march=armv7-a -mfpu=neon +endif + + +ifeq ($(CONFIG_ACL_GPU),y) + ACL_LIBS+=-Wl,-rpath,$(ACL_ROOT)/build/ -L$(ACL_ROOT)/build + ACL_LIBS+= -larm_compute_core -larm_compute + LIB_LDFLAGS+=$(ACL_LIBS) +endif + +ARM_OBJS =$(addprefix $(BUILD_DIR)/, $(foreach 
f,$(LIB_SUB_DIRS),$(f)/built-in.o)) + +ifeq ($(CONFIG_ARCH_BLAS),y) + LIB_LDFLAGS+=-lopenblas +endif + +$(LIB_HCL_SO): $(ARM_BLOB) + $(CC) -o $@ -shared -Wl,-Bsymbolic -Wl,-Bsymbolic-functions $(ARM_BLOB) $(LIB_LDFLAGS) + +$(ARM_BLOB): $(ARM_OBJS) + $(BUILT_IN_LD) -r -o $@ $(ARM_OBJS) + + +$(ARM_OBJS): $(LIB_SUB_DIRS); + + + +build: default install + + +install: + @mkdir -p $(INSTALL_DIR)/lib + cp -f $(LIB_HCL_SO) $(INSTALL_DIR)/lib + +$(LIB_SUB_DIRS): + @$(MAKE) -C $@ -f $(MAKEBUILD) BUILD_DIR=$(BUILD_DIR)/$@ $(MAKECMDGOALS) + + +clean: $(LIB_SUB_DIRS) + @rm -rf $(ARM_BLOB) $(LIB_HCL_SO) + + +.PHONY: build clean default test install $(LIB_SUB_DIRS) + + + + + + diff --git a/hclarm/lib/Makefile b/hclarm/lib/Makefile new file mode 100644 index 000000000..6b1f91f43 --- /dev/null +++ b/hclarm/lib/Makefile @@ -0,0 +1 @@ +obj-y+=hcl_version.o diff --git a/hclarm/lib/hcl_version.c b/hclarm/lib/hcl_version.c new file mode 100644 index 000000000..46bb1f81e --- /dev/null +++ b/hclarm/lib/hcl_version.c @@ -0,0 +1,29 @@ +#include + +#define HCL_VERSION "1.2.2" + +const char * get_hcl_version(void) +{ + static char hcl_version[64]; + const char * postfix="github"; + +#ifdef CONFIG_INTERN_RELEASE + postfix="trial"; +#endif + +#ifdef CONFIG_INTERN_TRIAL + postfix="release"; +#endif + +#ifdef CONFIG_AUTHENICATION + postfix="authed"; +#endif + int ret=snprintf(hcl_version,64,"%s-%s", HCL_VERSION,postfix); + + if(ret>=64) + hcl_version[63]=0; + + return hcl_version; +} + + diff --git a/include/any.hpp b/include/any.hpp index 758fc4ceb..741c5b81a 100644 --- a/include/any.hpp +++ b/include/any.hpp @@ -20,23 +20,21 @@ #include #include #include +#include - -namespace TEngine -{ +namespace TEngine { static inline std::string GetRealName(const char* name) { - std::string result; + std::string result; + + char* real_name = abi::__cxa_demangle(name, nullptr, nullptr, nullptr); - char * real_name=abi::__cxa_demangle(name, nullptr, - nullptr, nullptr); + result = real_name; - 
result=real_name; + std::free(real_name); - std::free(real_name); - - return result; + return result; } class bad_any_cast : public std::bad_cast @@ -44,31 +42,33 @@ class bad_any_cast : public std::bad_cast public: bad_any_cast(const std::type_info& expected, const std::type_info& real) { - std::string& message=GetMessage(); + std::string& message = GetMessage(); - message=std::string("Bad any cast: Expected: ")+GetRealName(real.name()); - message+=" Real: "+GetRealName(expected.name()); + message = std::string("Bad any cast: Expected: ") + GetRealName(real.name()); + message += " Real: " + GetRealName(expected.name()); - const char * str=getenv("HALT_ON_MISMATCH"); + const char* str = getenv("HALT_ON_MISMATCH"); - if(str) - { - std::cerr<clear(); } - /// Constructs an object of type any that contains an object of type T direct-initialized with std::forward(value). + /// Constructs an object of type any that contains an object of type T direct-initialized with + /// std::forward(value). /// /// T shall satisfy the CopyConstructible requirements, otherwise the program is ill-formed. - /// This is because an `any` may be copy constructed into another `any` at any time, so a copy should always be allowed. - template::type, any>::value>::type> + /// This is because an `any` may be copy constructed into another `any` at any time, so a copy should always be + /// allowed. + template ::type, any>::value>::type> any(ValueType&& value) { static_assert(std::is_copy_constructible::type>::value, - "T shall satisfy the CopyConstructible requirements."); + "T shall satisfy the CopyConstructible requirements."); this->construct(std::forward(value)); } @@ -141,12 +139,14 @@ class any final /// Has the same effect as any(std::forward(value)).swap(*this). No effect if a exception is thrown. /// /// T shall satisfy the CopyConstructible requirements, otherwise the program is ill-formed. 
- /// This is because an `any` may be copy constructed into another `any` at any time, so a copy should always be allowed. - template::type, any>::value>::type> + /// This is because an `any` may be copy constructed into another `any` at any time, so a copy should always be + /// allowed. + template ::type, any>::value>::type> any& operator=(ValueType&& value) { static_assert(std::is_copy_constructible::type>::value, - "T shall satisfy the CopyConstructible requirements."); + "T shall satisfy the CopyConstructible requirements."); any(std::forward(value)).swap(*this); return *this; } @@ -170,7 +170,7 @@ class any final /// If *this has a contained object of type T, typeid(T); otherwise typeid(void). const std::type_info& type() const noexcept { - return empty()? typeid(void) : this->vtable->type(); + return empty() ? typeid(void) : this->vtable->type(); } /// Exchange the states of *this and rhs. @@ -185,7 +185,7 @@ class any final if(this->vtable != nullptr) { this->vtable->move(this->storage, rhs.storage); - //this->vtable = nullptr; -- uneeded, see below + // this->vtable = nullptr; -- uneeded, see below } // move from tmp (previously rhs) to *this. @@ -196,84 +196,81 @@ class any final tmp.vtable = nullptr; } } - else // same types + else // same types { if(this->vtable != nullptr) this->vtable->swap(this->storage, rhs.storage); } } -private: // Storage and Virtual Method Table - +private: // Storage and Virtual Method Table union storage_union { using stack_storage_t = typename std::aligned_storage<2 * sizeof(void*), std::alignment_of::value>::type; - void* dynamic; - stack_storage_t stack; // 2 words for e.g. shared_ptr + void* dynamic; + stack_storage_t stack; // 2 words for e.g. shared_ptr }; - - /// Base VTable specification. - struct vtable_type + + /// Base VTable specification. + struct vtable_type + { + // Note: The caller is responssible for doing .vtable = nullptr after destructful operations + // such as destroy() and/or move(). 
+ + /// The type of the object this vtable is for. + const std::type_info& (*type)(); + + /// Destroys the object in the union. + /// The state of the union after this call is unspecified, caller must ensure not to use src anymore. + void (*destroy)(storage_union&); + + /// Copies the **inner** content of the src union into the yet unitialized dest union. + /// As such, both inner objects will have the same state, but on separate memory locations. + void (*copy)(const storage_union& src, storage_union& dest); + + /// Moves the storage from src to the yet unitialized dest union. + /// The state of src after this call is unspecified, caller must ensure not to use src anymore. + void (*move)(storage_union& src, storage_union& dest); + + /// Exchanges the storage between lhs and rhs. + void (*swap)(storage_union& lhs, storage_union& rhs); + }; + + /// VTable for dynamically allocated storage. + template struct vtable_dynamic + { + static const std::type_info& type() noexcept { - // Note: The caller is responssible for doing .vtable = nullptr after destructful operations - // such as destroy() and/or move(). - - /// The type of the object this vtable is for. - const std::type_info& (*type)(); - - /// Destroys the object in the union. - /// The state of the union after this call is unspecified, caller must ensure not to use src anymore. - void(*destroy)(storage_union&); - - /// Copies the **inner** content of the src union into the yet unitialized dest union. - /// As such, both inner objects will have the same state, but on separate memory locations. - void(*copy)(const storage_union& src, storage_union& dest); - - /// Moves the storage from src to the yet unitialized dest union. - /// The state of src after this call is unspecified, caller must ensure not to use src anymore. - void(*move)(storage_union& src, storage_union& dest); - - /// Exchanges the storage between lhs and rhs. 
- void(*swap)(storage_union& lhs, storage_union& rhs); - }; - - /// VTable for dynamically allocated storage. - template - struct vtable_dynamic + return typeid(T); + } + + static void destroy(storage_union& storage) noexcept { - static const std::type_info& type() noexcept - { - return typeid(T); - } - - static void destroy(storage_union& storage) noexcept - { - //assert(reinterpret_cast(storage.dynamic)); - delete reinterpret_cast(storage.dynamic); - } - - static void copy(const storage_union& src, storage_union& dest) - { - dest.dynamic = new T(*reinterpret_cast(src.dynamic)); - } - - static void move(storage_union& src, storage_union& dest) noexcept - { - dest.dynamic = src.dynamic; - src.dynamic = nullptr; - } - - static void swap(storage_union& lhs, storage_union& rhs) noexcept - { - // just exchage the storage pointers. - std::swap(lhs.dynamic, rhs.dynamic); - } - }; + // assert(reinterpret_cast(storage.dynamic)); + delete reinterpret_cast(storage.dynamic); + } + + static void copy(const storage_union& src, storage_union& dest) + { + dest.dynamic = new T(*reinterpret_cast(src.dynamic)); + } + + static void move(storage_union& src, storage_union& dest) noexcept + { + dest.dynamic = src.dynamic; + src.dynamic = nullptr; + } + + static void swap(storage_union& lhs, storage_union& rhs) noexcept + { + // just exchage the storage pointers. + std::swap(lhs.dynamic, rhs.dynamic); + } + }; /// VTable for stack allocated storage. - template - struct vtable_stack + template struct vtable_stack { static const std::type_info& type() noexcept { @@ -287,14 +284,14 @@ class any final static void copy(const storage_union& src, storage_union& dest) { - new (&dest.stack) T(reinterpret_cast(src.stack)); + new(&dest.stack) T(reinterpret_cast(src.stack)); } static void move(storage_union& src, storage_union& dest) noexcept { // one of the conditions for using vtable_stack is a nothrow move constructor, // so this move constructor will never throw a exception. 
- new (&dest.stack) T(std::move(reinterpret_cast(src.stack))); + new(&dest.stack) T(std::move(reinterpret_cast(src.stack))); destroy(src); } @@ -305,32 +302,30 @@ class any final }; /// Whether the type T must be dynamically allocated or can be stored on the stack. - template - struct requires_allocation : - std::integral_constant::value // N4562 §6.3/3 [any.class] - && sizeof(T) <= sizeof(storage_union::stack) - && std::alignment_of::value <= std::alignment_of::value)> - {}; + template + struct requires_allocation + : std::integral_constant::value // N4562 §6.3/3 [any.class] + && sizeof(T) <= sizeof(storage_union::stack) && + std::alignment_of::value <= + std::alignment_of::value)> + { + }; /// Returns the pointer to the vtable of the type T. - template - static vtable_type* vtable_for_type() + template static vtable_type* vtable_for_type() { - using VTableType = typename std::conditional::value, vtable_dynamic, vtable_stack>::type; + using VTableType = + typename std::conditional::value, vtable_dynamic, vtable_stack>::type; static vtable_type table = { - VTableType::type, VTableType::destroy, - VTableType::copy, VTableType::move, - VTableType::swap, + VTableType::type, VTableType::destroy, VTableType::copy, VTableType::move, VTableType::swap, }; return &table; } protected: - template - friend const T* any_cast(const any* operand) noexcept; - template - friend T* any_cast(any* operand) noexcept; + template friend const T* any_cast(const any* operand) noexcept; + template friend T* any_cast(any* operand) noexcept; /// Same effect as is_same(this->type(), t); bool is_typed(const std::type_info& t) const @@ -347,9 +342,13 @@ class any final static bool is_same(const std::type_info& a, const std::type_info& b) { #ifdef ANY_IMPL_FAST_TYPE_INFO_COMPARE - return &a == &b; + return &a == &b; +#else +#ifdef __ANDROID__ + return a == b || strcmp(a.name(),b.name()) == 0; #else return a == b; +#endif #endif } @@ -480,7 +479,13 @@ class any final inline T* any_cast(any* 
operand) noexcept { if(operand == nullptr || !operand->is_typed(typeid(T))) + { + if(operand != nullptr ) + { + std::cout << "type is not same-----------------------\n"; + } return nullptr; + } else return operand->cast(); } @@ -494,4 +499,3 @@ namespace std lhs.swap(rhs); } } - diff --git a/include/share_lib_parser.hpp b/include/share_lib_parser.hpp index d7903d6ec..881ff0ce8 100644 --- a/include/share_lib_parser.hpp +++ b/include/share_lib_parser.hpp @@ -54,7 +54,7 @@ class ShareLibParser sl = ::dlopen(so_path.c_str(), RTLD_LAZY | RTLD_GLOBAL); if(!sl) { - std::printf("%s\n", dlerror()); + //std::printf("%s\n", dlerror()); throw te_error_unable_to_load_library(so_path); return -1; } @@ -70,7 +70,7 @@ class ShareLibParser if(!f) { throw te_error_shared_function_not_found(func_name); - return nullptr; + //return nullptr; } func_map.emplace(func_name, ( func* )f); it = func_map.find(func_name); diff --git a/include/te_error.hpp b/include/te_error.hpp index 9ab8ee9ce..44101e377 100644 --- a/include/te_error.hpp +++ b/include/te_error.hpp @@ -34,22 +34,40 @@ struct te_error_base : public std::runtime_error { return error_code; } - te_error_base(error_code_t e) : runtime_error("tengine error"), error_code(e) {} + te_error_base() : runtime_error("tengine error"){} }; struct te_error_shared_function_not_found : public te_error_base { using te_error_base::te_error_base; + static std::string msg; + + te_error_shared_function_not_found(const std::string& func_name) + { + msg="\nShared function not found: "; + msg+=func_name; + msg+="\n"; + } + const char* what() const throw() override { - return "Shared function not found"; + return msg.c_str(); } }; struct te_error_unable_to_load_library : public te_error_base { using te_error_base::te_error_base; + static std::string msg; + + te_error_unable_to_load_library(const std::string& so_name) + { + msg="\nShared library not found: "; + msg+=so_name; + msg+="\n"; + } + const char* what() const throw() override { - return "Unable 
to load library"; + return msg.c_str(); } }; struct te_error_general : public te_error_base diff --git a/include/type_name.hpp b/include/type_name.hpp index de0560ba6..f3e49ef8e 100644 --- a/include/type_name.hpp +++ b/include/type_name.hpp @@ -37,7 +37,7 @@ template static std::string type_name() { typedef typename std::remove_reference::type TR; std::unique_ptr own( -#ifndef __GNUC__ +#if !defined(__GNUC__) || defined(NO_CXA_DEMANGLE) nullptr, #else abi::__cxa_demangle(typeid(TR).name(), nullptr, nullptr, nullptr), @@ -72,7 +72,7 @@ template static std::string GetNameForType(T&& t) static std::string GetTypeName(const char* name) { -#ifndef __GNUC__ +#if !defined(__GNUC__) || defined(NO_CXA_DEMANGLE) return name; #else std::unique_ptr own(abi::__cxa_demangle(name, nullptr, nullptr, nullptr), std::free); diff --git a/makefile.config.example b/makefile.config.example index cbac07ee3..bb6c32aae 100644 --- a/makefile.config.example +++ b/makefile.config.example @@ -15,8 +15,19 @@ # $ make -j8 #------------------------------------------------------------------------------- +# cross compile for ARM64 +# CROSS_COMPILE=aarch64-linux-gnu- +# cross compile for ARM32 +# CROSS_COMPILE=arm-linux-gnueabihf- + +# Just to differentiate with sysroot for embedded toolchains building +# As toolchains just need a few pre-built libraries +# +# EMBEDDED_CROSS_ROOT=/opt/install/ + # Set the target arch -CONFIG_ARCH_ARM64=y + CONFIG_ARCH_ARM64=y +# CONFIG_ARCH_ARM32=y # Enable Compiling Optimization CONFIG_OPT_CFLAGS = -O2 @@ -35,6 +46,7 @@ CONFIG_CAFFE_SERIALIZER=y # CONFIG_MXNET_SERIALIZER=y # CONFIG_ONNX_SERIALIZER=y # CONFIG_TF_SERIALIZER=y +# CONFIG_TFLITE_SERIALIZER=y CONFIG_TENGINE_SERIALIZER=y # Enable Wrappers @@ -46,3 +58,5 @@ CONFIG_VERSION_POSTFIX=github # support legacy API CONFIG_LEGACY_API=y +# kernel configuration +CONFIG_KERNEL_FP32=y diff --git a/operator/include/operator/add_n.hpp b/operator/include/operator/add_n.hpp new file mode 100644 index 000000000..ae528d306 
--- /dev/null +++ b/operator/include/operator/add_n.hpp @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ +#ifndef __ADDN_HPP__ +#define __ADDN_HPP__ + +#include "operator.hpp" +#include "addn_param.hpp" + +namespace TEngine { + +class Addn : public OperatorWithParam +{ +public: + Addn() + { + name_ = "Addn"; + } + Addn(const Addn& src) = default; + + void SetSchema(void) override; + + bool InferShape(const std::vector&, std::vector&, int layout) override; +}; +} // namespace TEngine + +#endif diff --git a/operator/include/operator/addn_param.hpp b/operator/include/operator/addn_param.hpp new file mode 100644 index 000000000..41cb560fd --- /dev/null +++ b/operator/include/operator/addn_param.hpp @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ +#ifndef __ADDN_PARAM_HPP__ +#define __ADDN_PARAM_HPP__ + +#include "parameter.hpp" + +namespace TEngine { + +struct AddnParam : public NamedParam +{ + int axis; + DECLARE_PARSER_STRUCTURE(AddnParam) + { + DECLARE_PARSER_ENTRY(axis); + } +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/conv_param.hpp b/operator/include/operator/conv_param.hpp index 565c53877..064b623a2 100644 --- a/operator/include/operator/conv_param.hpp +++ b/operator/include/operator/conv_param.hpp @@ -46,14 +46,16 @@ struct ConvParam : public NamedParam int kernel_w; int stride_h; int stride_w; - int pad_h; - int pad_w; int dilation_h; int dilation_w; + int input_channel; int output_channel; int group; int activation; - std::vector pads; + int pad_h0; // top padding rows + int pad_w0; // left padding columns + int pad_h1; // bottom padding rows + int pad_w1; // right padding columns DECLARE_PARSER_STRUCTURE(ConvParam) { @@ -61,13 +63,16 @@ struct ConvParam : public NamedParam DECLARE_PARSER_ENTRY(kernel_w); DECLARE_PARSER_ENTRY(stride_h); DECLARE_PARSER_ENTRY(stride_w); - DECLARE_PARSER_ENTRY(pad_h); - DECLARE_PARSER_ENTRY(pad_w); DECLARE_PARSER_ENTRY(dilation_h); DECLARE_PARSER_ENTRY(dilation_w); + DECLARE_PARSER_ENTRY(input_channel); DECLARE_PARSER_ENTRY(output_channel); DECLARE_PARSER_ENTRY(group); DECLARE_PARSER_ENTRY(activation); + DECLARE_PARSER_ENTRY(pad_h0); + DECLARE_PARSER_ENTRY(pad_w0); + DECLARE_PARSER_ENTRY(pad_h1); + DECLARE_PARSER_ENTRY(pad_w1); }; }; 
diff --git a/operator/include/operator/deconv_param.hpp b/operator/include/operator/deconv_param.hpp index f8bad7170..f3bb0c93e 100644 --- a/operator/include/operator/deconv_param.hpp +++ b/operator/include/operator/deconv_param.hpp @@ -30,19 +30,36 @@ namespace TEngine { struct DeconvParam : public NamedParam { - int kernel_size; - int stride; - int pad; int num_output; - int dilation; + int kernel_h; + int kernel_w; + int stride_h; + int stride_w; + int pad_h0; + int pad_w0; + int pad_h1; + int pad_w1; + int dilation_h; + int dilation_w; + int group; + int activation; DECLARE_PARSER_STRUCTURE(DeconvParam) { - DECLARE_PARSER_ENTRY(kernel_size); - DECLARE_PARSER_ENTRY(stride); - DECLARE_PARSER_ENTRY(pad); - DECLARE_PARSER_ENTRY(num_output); - DECLARE_PARSER_ENTRY(dilation); + DECLARE_PARSER_ENTRY(num_output); + DECLARE_PARSER_ENTRY(kernel_h); + DECLARE_PARSER_ENTRY(kernel_w); + DECLARE_PARSER_ENTRY(stride_h); + DECLARE_PARSER_ENTRY(stride_w); + DECLARE_PARSER_ENTRY(pad_h0); + DECLARE_PARSER_ENTRY(pad_w0); + DECLARE_PARSER_ENTRY(pad_h1); + DECLARE_PARSER_ENTRY(pad_w1); + DECLARE_PARSER_ENTRY(dilation_h); + DECLARE_PARSER_ENTRY(dilation_w); + DECLARE_PARSER_ENTRY(group); + DECLARE_PARSER_ENTRY(activation); + }; }; diff --git a/operator/include/operator/demo_op.hpp b/operator/include/operator/demo_op.hpp index d57736818..c8ed423f4 100644 --- a/operator/include/operator/demo_op.hpp +++ b/operator/include/operator/demo_op.hpp @@ -37,6 +37,8 @@ class DemoOp : public OperatorNoParam } DemoOp(const DemoOp& src) = default; + bool InferShape(const std::vector& ishape, std::vector& oshape, int layout) override; + void SetSchema(void) override; }; } // namespace TEngine diff --git a/operator/include/operator/eltwise.hpp b/operator/include/operator/eltwise.hpp index ccc677ce2..34b329fa1 100644 --- a/operator/include/operator/eltwise.hpp +++ b/operator/include/operator/eltwise.hpp @@ -38,23 +38,23 @@ class Eltwise : public OperatorWithParam Eltwise(const Eltwise& src) = default; 
virtual ~Eltwise(){}; - void MethodToType(EltwiseParam& param) - { - std::string& method = param.method; + // void MethodToType(EltwiseParam& param) + // { + // std::string& method = param.method; - /* default eltwise_SUM */ - param.type = ELT_SUM; + // /* default eltwise_SUM */ + // param.type = ELT_SUM; - if(method == "max") - param.type = ELT_MAX; - else if(method == "prod") - param.type = ELT_PROD; - } - void ParseParam(EltwiseParam& param, Operator* op) override - { - ParsePredefinedParam(param, op); - MethodToType(param); - } + // if(method == "max") + // param.type = ELT_MAX; + // else if(method == "prod") + // param.type = ELT_PROD; + // } + // void ParseParam(EltwiseParam& param, Operator* op) override + // { + // ParsePredefinedParam(param, op); + // MethodToType(param); + // } void SetSchema(void) override; bool InferShape(const std::vector& ishape, std::vector& oshape, int layout) override; diff --git a/operator/include/operator/eltwise_param.hpp b/operator/include/operator/eltwise_param.hpp index 740677a44..c63c1b428 100644 --- a/operator/include/operator/eltwise_param.hpp +++ b/operator/include/operator/eltwise_param.hpp @@ -37,20 +37,28 @@ enum EltType ELT_MAX, ELT_RSQRT, ELT_MIN_SCALAR, - ELT_LAST + ELT_LAST, + ELT_DIV, + ELT_LOG, + ELT_EXP, + ELT_SQRT, + ELT_FLOOR, + ELT_SQUARE, + ELT_POW }; namespace TEngine { struct EltwiseParam : public NamedParam { - std::string method; - EltType type; + // std::string method; + // EltType type; + int type; int caffe_flavor; DECLARE_PARSER_STRUCTURE(EltwiseParam) { - DECLARE_PARSER_ENTRY(method); + DECLARE_PARSER_ENTRY(type); DECLARE_PARSER_ENTRY(caffe_flavor); }; }; diff --git a/operator/include/operator/gru.hpp b/operator/include/operator/gru.hpp new file mode 100644 index 000000000..6a13fdd9e --- /dev/null +++ b/operator/include/operator/gru.hpp @@ -0,0 +1,63 @@ +#ifndef __GRU_HPP__ +#define __GRU_HPP__ + +#include "operator.hpp" +#include "gru_param.hpp" + +namespace TEngine { + +class GRU : public 
OperatorWithParam +{ +public: + GRU(void) + { + name_ = "GRU"; + } + GRU(const GRU&) = default; + void SetSchema(void) override; + bool InferShape(const std::vector&, std::vector&, int layout) override; + const char* GetBiasName(void) + { + return "gates/bias"; + } + const char* GetKernelName(void) + { + return "gates/kernel"; + } + const char* GetInitHiddenName(void) + { + return "init_h"; + } + const char* GetCandidateKernelName(void) + { + return "candidate/kernel"; + } + const char* GetCandidateBiasName(void) + { + return "candidate/bias"; + } + const char* Geti2hweightName(void) + { + return "i2h_weight"; + } + const char* Geti2hbiasName(void) + { + return "i2h_bias"; + } + const char* Geth2hweightName(void) + { + return "h2h_weight"; + } + const char* Geth2hbiasName(void) + { + return "h2h_bias"; + } + const char* GetFusedKernelName(void) + { + return "parameters"; + } +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/gru_param.hpp b/operator/include/operator/gru_param.hpp new file mode 100644 index 000000000..514c3bab0 --- /dev/null +++ b/operator/include/operator/gru_param.hpp @@ -0,0 +1,58 @@ +/* + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __GRU_PARAM_HPP__ +#define __GRU_PARAM_HPP__ + +#include + +#include "parameter.hpp" + +namespace TEngine { + +#define GRU_ACT_TANH 1 + +struct GRUParam : public NamedParam +{ + float clip; + int output_len; + int sequence_len; + int input_size; + int hidden_size; + int has_clip; + int has_gate_bias; + int has_candidate_bias; + int has_init_state; + int mxnet_flag; + + DECLARE_PARSER_STRUCTURE(GRUParam) + { + DECLARE_PARSER_ENTRY(clip); + DECLARE_PARSER_ENTRY(output_len); + DECLARE_PARSER_ENTRY(sequence_len); + DECLARE_PARSER_ENTRY(input_size); + DECLARE_PARSER_ENTRY(hidden_size); + DECLARE_PARSER_ENTRY(has_clip); + DECLARE_PARSER_ENTRY(has_gate_bias); + DECLARE_PARSER_ENTRY(has_candidate_bias); + DECLARE_PARSER_ENTRY(has_init_state); + DECLARE_PARSER_ENTRY(mxnet_flag); + }; +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/lstm.hpp b/operator/include/operator/lstm.hpp index 62c21ed59..8d4dcf13b 100644 --- a/operator/include/operator/lstm.hpp +++ b/operator/include/operator/lstm.hpp @@ -16,6 +16,10 @@ class LSTM : public OperatorWithParam LSTM(const LSTM&) = default; void SetSchema(void) override; bool InferShape(const std::vector&, std::vector&, int layout) override; + const char* GetKernelName(void) + { + return "kernel"; + } const char* GetBiasName(void) { return "bias"; @@ -44,6 +48,27 @@ class LSTM : public OperatorWithParam { return "init_h"; } + const char* Geti2hKernelName(void) + { + return "i2h_weight"; + } + const char* Geti2hBiasName(void) + { + return "i2h_bias"; + } + const char* Geth2hKernelName(void) + { + return "h2h_weight"; + } + const char* Geth2hBiasName(void) + { + return "h2h_bias"; + } + const char* GetFusedKernelName(void) + { + return "parameters"; + } + }; } // namespace TEngine diff --git a/operator/include/operator/lstm_param.hpp b/operator/include/operator/lstm_param.hpp index 6e0551902..5c01cbf02 100644 --- 
a/operator/include/operator/lstm_param.hpp +++ b/operator/include/operator/lstm_param.hpp @@ -30,6 +30,9 @@ namespace TEngine { +#define LSTM_ACT_SIGMOID 1 +#define LSTM_ACT_TANH 2 + struct LSTMParam : public NamedParam { float forget_bias; @@ -44,11 +47,12 @@ struct LSTMParam : public NamedParam int has_clip; int has_bias; int has_init_state; - const char* forget_act; - const char* input_act; - const char* output_act; - const char* cellin_act; - const char* cellout_act; + int forget_act; + int input_act; + int output_act; + int cellin_act; + int cellout_act; + int mxnet_flag; DECLARE_PARSER_STRUCTURE(LSTMParam) { @@ -69,6 +73,7 @@ struct LSTMParam : public NamedParam DECLARE_PARSER_ENTRY(cellin_act); DECLARE_PARSER_ENTRY(output_act); DECLARE_PARSER_ENTRY(cellout_act); + DECLARE_PARSER_ENTRY(mxnet_flag); }; }; diff --git a/operator/include/operator/pad.hpp b/operator/include/operator/pad.hpp new file mode 100644 index 000000000..d11183ce2 --- /dev/null +++ b/operator/include/operator/pad.hpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __PAD_HPP__ +#define __PAD_HPP__ + +#include "operator.hpp" +#include "pad_param.hpp" + +namespace TEngine { + +class Pad : public OperatorWithParam +{ +public: + Pad() + { + name_ = "Pad"; + } + Pad(const Pad& src) = default; + + virtual ~Pad() {} + bool InferShape(const std::vector& ishape, std::vector& oshape, + int layout) override; + void SetSchema(void) override; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/pad_param.hpp b/operator/include/operator/pad_param.hpp new file mode 100644 index 000000000..82eb30fb9 --- /dev/null +++ b/operator/include/operator/pad_param.hpp @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __PAD_PARAM_HPP__ +#define __PAD_PARAM_HPP__ + +#include "parameter.hpp" + +namespace TEngine { + +struct PadParam : public NamedParam +{ + //mode : 0: CONSTANT; 1: REFLECT; 2: SYMMETRIC. 
+ int mode; + int pad_0_h; + int pad_0_w; + int pad_1_h; + int pad_1_w; + int pad_2_h; + int pad_2_w; + int pad_3_h; + int pad_3_w; + float value; + + DECLARE_PARSER_STRUCTURE(PadParam) + { + DECLARE_PARSER_ENTRY(mode); + DECLARE_PARSER_ENTRY(pad_0_h); + DECLARE_PARSER_ENTRY(pad_0_w); + DECLARE_PARSER_ENTRY(pad_1_h); + DECLARE_PARSER_ENTRY(pad_1_w); + DECLARE_PARSER_ENTRY(pad_2_h); + DECLARE_PARSER_ENTRY(pad_2_w); + DECLARE_PARSER_ENTRY(pad_3_h); + DECLARE_PARSER_ENTRY(pad_3_w); + DECLARE_PARSER_ENTRY(value); + }; +}; +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/pool_param.hpp b/operator/include/operator/pool_param.hpp index 7254b882e..7f1ea4e39 100644 --- a/operator/include/operator/pool_param.hpp +++ b/operator/include/operator/pool_param.hpp @@ -48,15 +48,14 @@ struct PoolParam : public NamedParam int alg; int kernel_h; int kernel_w; - int pad_h; - int pad_w; int stride_h; int stride_w; int global; int caffe_flavor; - std::vector kernel_shape; ///> The size of the kernel along each axis (H, W). - std::vector strides; ///> stride along each axis (H, W). - std::vector pads; ///> [x1_begin, x2_begin...x1_end, x2_end,...] for each axis. 
+ int pad_h0; // top padding rows + int pad_w0; // left padding columns + int pad_h1; // bottom padding rows + int pad_w1; // right padding columns DECLARE_PARSER_STRUCTURE(PoolParam) { @@ -65,10 +64,12 @@ struct PoolParam : public NamedParam DECLARE_PARSER_ENTRY(kernel_w); DECLARE_PARSER_ENTRY(stride_h); DECLARE_PARSER_ENTRY(stride_w); - DECLARE_PARSER_ENTRY(pad_h); - DECLARE_PARSER_ENTRY(pad_w); DECLARE_PARSER_ENTRY(global); DECLARE_PARSER_ENTRY(caffe_flavor); + DECLARE_PARSER_ENTRY(pad_h0); + DECLARE_PARSER_ENTRY(pad_w0); + DECLARE_PARSER_ENTRY(pad_h1); + DECLARE_PARSER_ENTRY(pad_w1); }; }; diff --git a/operator/include/operator/pooling.hpp b/operator/include/operator/pooling.hpp index cd62f9176..34e446ccf 100644 --- a/operator/include/operator/pooling.hpp +++ b/operator/include/operator/pooling.hpp @@ -47,26 +47,6 @@ class Pooling : public OperatorWithParam void SetSchema(void) override; - void ParseParam(PoolParam& param, Operator* op) override - { - ParsePredefinedParam(param, op); - - /* translate to onnx parameters */ - param.kernel_shape.resize(2); - - param.kernel_shape[0] = param.kernel_h; - param.kernel_shape[1] = param.kernel_w; - - param.strides.resize(2); - param.strides[0] = param.stride_h; - param.strides[1] = param.stride_w; - - param.pads.resize(4); - param.pads[0] = param.pad_h; - param.pads[1] = param.pad_w; - param.pads[2] = param.pad_h; - param.pads[3] = param.pad_w; - } }; } // namespace TEngine diff --git a/operator/include/operator/reduction.hpp b/operator/include/operator/reduction.hpp new file mode 100644 index 000000000..65b887dac --- /dev/null +++ b/operator/include/operator/reduction.hpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __REDUCTION_HPP__ +#define __REDUCTION_HPP__ + +#include "operator.hpp" +#include "reduction_param.hpp" + +namespace TEngine { + +class Reduction : public OperatorWithParam +{ +public: + Reduction() + { + name_ = "Reduction"; + } + Reduction(const Reduction& src) = default; + + virtual ~Reduction() {} + bool InferShape(const std::vector& ishape, std::vector& oshape, + int layout) override; + void SetSchema(void) override; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/reduction_param.hpp b/operator/include/operator/reduction_param.hpp new file mode 100644 index 000000000..49c1c4ed7 --- /dev/null +++ b/operator/include/operator/reduction_param.hpp @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __REDUCTION_PARAM_HPP__ +#define __REDUCTION_PARAM_HPP__ + +#include "parameter.hpp" + +namespace TEngine { + +struct ReductionParam : public NamedParam +{ + int dim_0; + int dim_1; + int dim_2; + int dim_3; + //type : 0: sum; 1: mean. + int type; + int keepdim; + DECLARE_PARSER_STRUCTURE(ReductionParam) + { + DECLARE_PARSER_ENTRY(dim_0); + DECLARE_PARSER_ENTRY(dim_1); + DECLARE_PARSER_ENTRY(dim_2); + DECLARE_PARSER_ENTRY(dim_3); + DECLARE_PARSER_ENTRY(keepdim); + DECLARE_PARSER_ENTRY(type); + }; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/rnn.hpp b/operator/include/operator/rnn.hpp new file mode 100644 index 000000000..8811bf4e2 --- /dev/null +++ b/operator/include/operator/rnn.hpp @@ -0,0 +1,31 @@ +#ifndef __RNN_HPP__ +#define __RNN_HPP__ + +#include "operator.hpp" +#include "rnn_param.hpp" + +namespace TEngine { + +class RNN : public OperatorWithParam +{ +public: + RNN(void) + { + name_ = "RNN"; + } + RNN(const RNN&) = default; + void SetSchema(void) override; + bool InferShape(const std::vector&, std::vector&, int layout) override; + const char* GetBiasName(void) + { + return "bias"; + } + const char* GetInitHiddenName(void) + { + return "init_h"; + } +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/rnn_param.hpp b/operator/include/operator/rnn_param.hpp new file mode 100644 index 000000000..cc79455c4 --- /dev/null +++ 
b/operator/include/operator/rnn_param.hpp @@ -0,0 +1,56 @@ +/* + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __RNN_PARAM_HPP__ +#define __RNN_PARAM_HPP__ + +#include + +#include "parameter.hpp" + +namespace TEngine { + +#define RNN_ACT_TANH 1 + +struct RNNParam : public NamedParam +{ + float clip; + int output_len; + int sequence_len; + int input_size; + int hidden_size; + int has_clip; + int has_bias; + int has_init_state; + int activation; + + DECLARE_PARSER_STRUCTURE(RNNParam) + { + DECLARE_PARSER_ENTRY(clip); + DECLARE_PARSER_ENTRY(output_len); + DECLARE_PARSER_ENTRY(sequence_len); + DECLARE_PARSER_ENTRY(input_size); + DECLARE_PARSER_ENTRY(hidden_size); + DECLARE_PARSER_ENTRY(has_clip); + DECLARE_PARSER_ENTRY(has_bias); + DECLARE_PARSER_ENTRY(has_init_state); + DECLARE_PARSER_ENTRY(activation); + }; +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/sigmoid.hpp b/operator/include/operator/sigmoid.hpp new file mode 100644 index 000000000..c1a9ce5d7 --- /dev/null +++ b/operator/include/operator/sigmoid.hpp @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ +#ifndef __SIGMOID_HPP__ +#define __SIGMOID_HPP__ + +#include "operator.hpp" + +namespace TEngine { + +class Sigmoid : public OperatorNoParam +{ +public: + Sigmoid() + { + name_ = "Sigmoid"; + } + Sigmoid(const Sigmoid& src) = default; + virtual ~Sigmoid(){}; + + float GetFops(const std::vector& inputs, const std::vector& outputs) override; + + void SetSchema(void) override; +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/slice_param.hpp b/operator/include/operator/slice_param.hpp index f19744a7a..32c4d5a27 100644 --- a/operator/include/operator/slice_param.hpp +++ b/operator/include/operator/slice_param.hpp @@ -31,6 +31,10 @@ namespace TEngine { struct SliceParam : public NamedParam { int axis; + std::vector slice_point_; + std::vector begin_; + std::vector size_; + bool iscaffe; DECLARE_PARSER_STRUCTURE(SliceParam) { diff --git a/operator/include/operator/split.hpp b/operator/include/operator/split.hpp index e1712c5ff..fa6881127 100644 --- a/operator/include/operator/split.hpp +++ b/operator/include/operator/split.hpp @@ -25,10 +25,11 @@ #define __SPLIT_HPP__ #include "operator.hpp" +#include "split_param.hpp" namespace TEngine { -class Split : public OperatorNoParam +class Split : public OperatorWithParam { public: Split() @@ -36,7 +37,7 @@ class Split : public OperatorNoParam name_ = "Split"; } Split(const Split& src) = default; - ~Split() {} + virtual ~Split() {} bool InferShape(const std::vector& ishape, std::vector& oshape, int 
layout) override; diff --git a/operator/include/operator/split_param.hpp b/operator/include/operator/split_param.hpp new file mode 100644 index 000000000..6e5627b6a --- /dev/null +++ b/operator/include/operator/split_param.hpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __SPLIT_PARAM_HPP__ +#define __SPLIT_PARAM_HPP__ + +#include "parameter.hpp" + +namespace TEngine { + +struct SplitParam : public NamedParam +{ + int axis; + int split_dim; + bool is_caffe; + std::vector split_sizes_; + + DECLARE_PARSER_STRUCTURE(SplitParam) + { + DECLARE_PARSER_ENTRY(axis); + DECLARE_PARSER_ENTRY(split_dim); + DECLARE_PARSER_ENTRY(is_caffe); + DECLARE_PARSER_ENTRY(split_sizes_); + } +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/squeeze.hpp b/operator/include/operator/squeeze.hpp new file mode 100644 index 000000000..7113861ad --- /dev/null +++ b/operator/include/operator/squeeze.hpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __SQUEEZE_HPP__ +#define __SQUEEZE_HPP__ + +#include "operator.hpp" +#include "squeeze_param.hpp" + +namespace TEngine { + +class Squeeze : public OperatorWithParam +{ +public: + Squeeze() + { + name_ = "Squeeze"; + } + Squeeze(const Squeeze& src) = default; + + virtual ~Squeeze() {} + bool InferShape(const std::vector& ishape, std::vector& oshape, + int layout) override; + void SetSchema(void) override; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/operator/include/operator/squeeze_param.hpp b/operator/include/operator/squeeze_param.hpp new file mode 100644 index 000000000..9377f0078 --- /dev/null +++ b/operator/include/operator/squeeze_param.hpp @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#ifndef __SQUEEZE_PARAM_HPP__ +#define __SQUEEZE_PARAM_HPP__ + +#include "parameter.hpp" + +namespace TEngine { + +struct SqueezeParam : public NamedParam +{ + int dim_0; + int dim_1; + int dim_2; + int dim_3; + DECLARE_PARSER_STRUCTURE(SqueezeParam) + { + DECLARE_PARSER_ENTRY(dim_0); + DECLARE_PARSER_ENTRY(dim_1); + DECLARE_PARSER_ENTRY(dim_2); + DECLARE_PARSER_ENTRY(dim_3); + }; +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/swap_axis.hpp b/operator/include/operator/swap_axis.hpp new file mode 100644 index 000000000..9ae5f8cf9 --- /dev/null +++ b/operator/include/operator/swap_axis.hpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ +#ifndef __SWAP_AXIS_HPP__ +#define __SWAP_AXIS_HPP__ + +#include "operator.hpp" +#include "swap_axis_param.hpp" + +namespace TEngine { + +class SwapAxis : public OperatorWithParam +{ +public: + SwapAxis() + { + name_ = "SwapAxis"; + } + SwapAxis(const SwapAxis& src) = default; + + virtual ~SwapAxis() {} + bool InferShape(const std::vector& ishape, std::vector& oshape, + int layout) override; + void SetSchema(void) override; +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/swap_axis_param.hpp b/operator/include/operator/swap_axis_param.hpp new file mode 100644 index 000000000..1099442e3 --- /dev/null +++ b/operator/include/operator/swap_axis_param.hpp @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: haoluo@openailab.com + */ +#ifndef __SWAP_AXIS_PARAM_HPP__ +#define __SWAP_AXIS_PARAM_HPP__ + +#include "parameter.hpp" + +namespace TEngine { + +struct SwapAxisParam : public NamedParam +{ + int dim_0; + int dim_1; + DECLARE_PARSER_STRUCTURE(SwapAxisParam) + { + DECLARE_PARSER_ENTRY(dim_0); + DECLARE_PARSER_ENTRY(dim_1); + }; +}; + +} // namespace TEngine + +#endif diff --git a/operator/include/operator/tanh.hpp b/operator/include/operator/tanh.hpp new file mode 100644 index 000000000..9635388c8 --- /dev/null +++ b/operator/include/operator/tanh.hpp @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ +#ifndef __TANH_HPP__ +#define __TANH_HPP__ + +#include "operator.hpp" + +namespace TEngine { + +class Tanh : public OperatorNoParam +{ +public: + Tanh() + { + name_ = "Tanh"; + } + Tanh(const Tanh& src) = default; + virtual ~Tanh(){}; + + void SetSchema(void) override; +}; + +} // namespace TEngine + +#endif diff --git a/operator/operator/Makefile b/operator/operator/Makefile index 5a0169172..376db442b 100644 --- a/operator/operator/Makefile +++ b/operator/operator/Makefile @@ -1,37 +1,46 @@ -obj-y+=convolution.o -obj-y+=softmax.o -obj-y+=pooling.o -obj-y+=input_op.o -obj-y+=fully_connected.o -obj-y+=relu.o -obj-y+=const_op.o -obj-y+=split.o -obj-y+=concat.o -obj-y+=dropout.o obj-y+=accuracy.o +obj-y+=addn.o obj-y+=batch_norm.o -obj-y+=scale.o -obj-y+=lrn.o -obj-y+=fused_operator.o -obj-y+=prelu.o -obj-y+=eltwise.o -obj-y+=slice.o +obj-y+=concat.o +obj-y+=const_op.o +obj-y+=convolution.o +obj-y+=deconvolution.o obj-y+=demo_op.o +obj-y+=detection_output.o +obj-y+=detection_postprocess.o +obj-y+=dropout.o +obj-y+=eltwise.o +obj-y+=flatten.o +obj-y+=fully_connected.o +obj-y+=fused_operator.o +obj-y+=gemm.o +obj-y+=generic.o +obj-y+=input_op.o +obj-y+=logistic.o +obj-y+=lrn.o +obj-y+=lstm.o obj-y+=normalize.o +obj-y+=pad.o obj-y+=permute.o -obj-y+=flatten.o +obj-y+=pooling.o +obj-y+=prelu.o obj-y+=priorbox.o -obj-y+=reshape.o -obj-y+=detection_output.o -obj-y+=rpn.o -obj-y+=roi_pooling.o -obj-y+=reorg.o +obj-y+=reduction.o obj-y+=region.o +obj-y+=relu.o obj-y+=relu6.o -obj-y+=deconvolution.o +obj-y+=reorg.o +obj-y+=reshape.o obj-y+=resize.o -obj-y+=gemm.o -obj-y+=generic.o -obj-y+=lstm.o -obj-y+=logistic.o -obj-y+=detection_postprocess.o +obj-y+=rnn.o +obj-y+=roi_pooling.o +obj-y+=rpn.o +obj-y+=scale.o +obj-y+=sigmoid.o +obj-y+=slice.o +obj-y+=softmax.o +obj-y+=split.o +obj-y+=squeeze.o +obj-y+=swap_axis.o +obj-y+=tanh.o +obj-y+=gru.o diff --git a/operator/operator/accuracy.cpp 
b/operator/operator/accuracy.cpp index 1e6633a2e..aca5330e2 100644 --- a/operator/operator/accuracy.cpp +++ b/operator/operator/accuracy.cpp @@ -27,7 +27,7 @@ namespace TEngine { void Accuracy::SetSchema(void) { - Input({"input:float32"}).Output({"output:float32"}).SetLayout("W").SetDoc(R"DOC(Accuracy Operator)DOC"); + Input({"input:float32"}).Output({"output:float32"}).SetDoc(R"DOC(Accuracy Operator)DOC"); } } // namespace TEngine diff --git a/core/lib/data_layout.cpp b/operator/operator/addn.cpp similarity index 66% rename from core/lib/data_layout.cpp rename to operator/operator/addn.cpp index 515861cf8..15a84372b 100644 --- a/core/lib/data_layout.cpp +++ b/operator/operator/addn.cpp @@ -21,23 +21,22 @@ * Copyright (c) 2017, Open AI Lab * Author: haitao@openailab.com */ -#include "data_layout.hpp" +#include "operator/add_n.hpp" +#include "static_graph.hpp" namespace TEngine { -template <> void NamedData::InitPredefinedData() +bool Addn::InferShape(const std::vector& ishape, std::vector& oshape, int layout) { -#define DUMMY_OBJECT(type) static type DUMMY_OBJECT_##type -#define DUMMY_OBJECT_DEFAULT(type) static type DUMMY_OBJECT_##type(true) + oshape[0] = ishape[0]; + return true; +} - DUMMY_OBJECT_DEFAULT(LayoutNCHW); - DUMMY_OBJECT(LayoutNCDHW); - DUMMY_OBJECT(LayoutNHWC); - DUMMY_OBJECT(LayoutNDHWC); - DUMMY_OBJECT(LayoutNHW); - DUMMY_OBJECT(LayoutNW); - DUMMY_OBJECT(LayoutHW); - DUMMY_OBJECT(LayoutW); +void Addn::SetSchema(void) +{ + Input({"input:float32"}) + .Output({"output:float32"}) + .SetDoc(R"DOC(Addn Operator)DOC"); } } // namespace TEngine diff --git a/operator/operator/concat.cpp b/operator/operator/concat.cpp index 76205f0e3..5edd368eb 100644 --- a/operator/operator/concat.cpp +++ b/operator/operator/concat.cpp @@ -55,7 +55,6 @@ void Concat::SetSchema(void) Input({"input:float32"}) .Output({"output:float32"}) .SetAttr("axis", 1) - .SetLayout("NCHW") .SetDoc(R"DOC(Concat Operator)DOC"); } diff --git a/operator/operator/convolution.cpp 
b/operator/operator/convolution.cpp index 954808867..3a0f51b13 100644 --- a/operator/operator/convolution.cpp +++ b/operator/operator/convolution.cpp @@ -73,57 +73,43 @@ bool Convolution::InferShape(const std::vector& ishape, std::vector= 0) + if(param_.pad_h0 == -1) // TF or SAME_UPPER in ONNX { - param_.pads[0] = param_.pad_h; - param_.pads[2] = param_.pad_h; + param_.pad_h0 = pad_num / 2; + param_.pad_h1 = pad_num - pad_num / 2; } else { - int n = (input_h - 1) / param_.stride_h + 1; - int total_len = (n - 1) * param_.stride_h + param_.kernel_h; - int pad_num = total_len - input_h; - - if(param_.pad_h == -1) // TF or SAME_UPPER in ONNX - { - param_.pads[0] = pad_num / 2; - param_.pads[2] = pad_num - pad_num / 2; - } - else - { - // SAME_LOWER in ONNX - param_.pads[0] = pad_num - pad_num / 2; - param_.pads[2] = pad_num / 2; - } + // SAME_LOWER in ONNX + param_.pad_h0 = pad_num - pad_num / 2; + param_.pad_h1 = pad_num / 2; } + } + + if(param_.pad_w0 < 0) + { + int n = (input_w - 1) / param_.stride_w + 1; + int total_len = (n - 1) * param_.stride_w + param_.kernel_w; + int pad_num = total_len - input_w; - if(param_.pad_w >= 0) + if(param_.pad_w0 == -1) // TF or SAME_UPPER in ONNX { - param_.pads[1] = param_.pad_w; - param_.pads[3] = param_.pad_w; + param_.pad_w0 = pad_num / 2; + param_.pad_w1 = pad_num - pad_num / 2; } else { - int n = (input_w - 1) / param_.stride_w + 1; - int total_len = (n - 1) * param_.stride_w + param_.kernel_w; - int pad_num = total_len - input_w; - - if(param_.pad_w == -1) // TF or SAME_UPPER in ONNX - { - param_.pads[1] = pad_num / 2; - param_.pads[3] = pad_num - pad_num / 2; - } - else - { - // SAME_LOWER in ONNX - param_.pads[1] = pad_num - pad_num / 2; - param_.pads[3] = pad_num / 2; - } + // SAME_LOWER in ONNX + param_.pad_w0 = pad_num - pad_num / 2; + param_.pad_w1 = pad_num / 2; } } @@ -131,9 +117,9 @@ bool Convolution::InferShape(const std::vector& ishape, std::vector& ishape, std::vector dim = {input_n, output_h, output_w, 
output_c}; result.SetDim(dim); - result.SetDataLayout("NHWC"); + result.SetDataLayout(TENGINE_LAYOUT_NHWC); } else { std::vector dim = {input_n, output_c, output_h, output_w}; result.SetDim(dim); - result.SetDataLayout("NCHW"); + result.SetDataLayout(TENGINE_LAYOUT_NCHW); } oshape[0] = result; @@ -176,18 +162,20 @@ void Convolution::SetSchema(void) { Input({"input:float32", "weight:float32", "bias:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("kernel_h", 1) .SetAttr("kernel_w", 1) .SetAttr("stride_h", 1) .SetAttr("stride_w", 1) - .SetAttr("pad_h", 0) - .SetAttr("pad_w", 0) .SetAttr("dilation_h", 1) .SetAttr("dilation_w", 1) + .SetAttr("input_channel", 1) .SetAttr("output_channel", 1) .SetAttr("group", 1) .SetAttr("activation", -1) + .SetAttr("pad_h0", 0) + .SetAttr("pad_w0", 0) + .SetAttr("pad_h1", 0) + .SetAttr("pad_w1", 0) .SetDoc(R"DOC(Convolution Layer)DOC"); } diff --git a/operator/operator/deconvolution.cpp b/operator/operator/deconvolution.cpp index 80ef1d3cf..4101839f6 100644 --- a/operator/operator/deconvolution.cpp +++ b/operator/operator/deconvolution.cpp @@ -35,16 +35,18 @@ bool Deconvolution::InferShape(const std::vector& ishape, std::vector dim = {input_n, param_.num_output, output_h, output_w}; TShape result; result.SetDim(dim); - result.SetDataLayout("NCHW"); + result.SetDataLayout(input_shape.GetDataLayout()); oshape[0] = result; @@ -53,7 +55,7 @@ bool Deconvolution::InferShape(const std::vector& ishape, std::vector& inputs, const std::vector& outputs) { - float ops = 1.0f * param_.num_output * param_.kernel_size * param_.kernel_size * inputs[0].GetSize() * 2; + float ops = 1.0f * param_.num_output * param_.kernel_h * param_.kernel_w * inputs[0].GetSize() * 2; return ops; } @@ -61,13 +63,20 @@ float Deconvolution::GetFops(const std::vector& inputs, const std::vecto void Deconvolution::SetSchema(void) { Input({"input:float32", "weight:float32", "bias:float32"}) - .Output({"output:float32"}) - .SetLayout("NCHW") - 
.SetAttr("kernel_size", 1) - .SetAttr("stride", 1) - .SetAttr("pad", 1) + .Output({"output:float32"}) + .SetAttr("kernel_h", 1) + .SetAttr("kernel_w", 1) + .SetAttr("stride_h", 1) + .SetAttr("stride_w", 1) + .SetAttr("pad_h0", 0) + .SetAttr("pad_w0", 0) + .SetAttr("pad_h1", 0) + .SetAttr("pad_w1", 0) + .SetAttr("dilation_h", 1) + .SetAttr("dilation_w", 1) .SetAttr("num_output", 1) - .SetAttr("dilation", 1) + .SetAttr("group", 1) + .SetAttr("activation", -1) .SetDoc(R"DOC(Deconvolution Layer)DOC"); } diff --git a/operator/operator/demo_op.cpp b/operator/operator/demo_op.cpp index b59bcdca7..8f72ce846 100644 --- a/operator/operator/demo_op.cpp +++ b/operator/operator/demo_op.cpp @@ -25,11 +25,31 @@ namespace TEngine { +/* + DemoOps demos to permute a 2d matrix and + then expanding one column to summarize each row of the permuted matrix +*/ + +bool DemoOp::InferShape(const std::vector& ishape, std::vector& oshape, int layout) +{ + int h=ishape[0].Shape(0); + int w=ishape[0].Shape(1); + std::vector dims; + + dims.push_back(w); + dims.push_back(h+1); + + oshape[0].SetDim(dims); + oshape[0].SetDataLayout(layout); + + return true; +} + + void DemoOp::SetSchema(void) { Input({"input:float32/int8"}) .Output({"output:float32/int8"}) - .SetLayout("NCHW") .SetDoc(R"DOC(Demo Operator: a demo operator to show how to define and run a operator)DOC"); } diff --git a/operator/operator/detection_output.cpp b/operator/operator/detection_output.cpp index 66b7f9b01..8d2983c50 100644 --- a/operator/operator/detection_output.cpp +++ b/operator/operator/detection_output.cpp @@ -35,7 +35,7 @@ bool DetectionOutput::InferShape(const std::vector& ishape, std TShape shape; std::vector dim = {in_dim[0], 1, 6, 1}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; return true; } @@ -44,7 +44,6 @@ void DetectionOutput::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("num_classes",
21) .SetDoc(R"DOC(DetectionOutput Layer)DOC"); diff --git a/operator/operator/detection_postprocess.cpp b/operator/operator/detection_postprocess.cpp index 313ba8733..d12a4089b 100644 --- a/operator/operator/detection_postprocess.cpp +++ b/operator/operator/detection_postprocess.cpp @@ -46,7 +46,7 @@ bool DetectionPostProcess::InferShape(const std::vector& ishape std::vector dim3 = {1, num_detected_boxes}; std::vector dim4 = {1}; - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(ishape[0].GetDataLayout()); shape.SetDim(dim1); oshape[0] = shape; shape.SetDim(dim2); @@ -61,7 +61,7 @@ bool DetectionPostProcess::InferShape(const std::vector& ishape void DetectionPostProcess::SetSchema(void) { - Input({"input:float32"}).Output({"output:float32"}).SetLayout("NCHW").SetDoc(R"DOC(DetectionPostProcess Layer)DOC"); + Input({"input:float32"}).Output({"output:float32"}).SetDoc(R"DOC(DetectionPostProcess Layer)DOC"); } } // namespace TEngine diff --git a/operator/operator/dropout.cpp b/operator/operator/dropout.cpp index 066315536..6df51b791 100644 --- a/operator/operator/dropout.cpp +++ b/operator/operator/dropout.cpp @@ -27,7 +27,7 @@ namespace TEngine { void Dropout::SetSchema(void) { - Input({"input:float32"}).Output({"output:float32"}).SetLayout("NCHW").SetDoc(R"DOC(Dropout Operator)DOC"); + Input({"input:float32"}).Output({"output:float32"}).SetDoc(R"DOC(Dropout Operator)DOC"); } } // namespace TEngine diff --git a/operator/operator/eltwise.cpp b/operator/operator/eltwise.cpp index ab317d90c..438a17c9d 100644 --- a/operator/operator/eltwise.cpp +++ b/operator/operator/eltwise.cpp @@ -59,8 +59,7 @@ void Eltwise::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") - .SetAttr("method", "sum") + .SetAttr("type", 2) .SetAttr("caffe_flavor", 1) .SetDoc(R"DOC(Eltwise Layer)DOC"); } diff --git a/operator/operator/flatten.cpp b/operator/operator/flatten.cpp index 800d00bf4..b59d469a9 100644 --- a/operator/operator/flatten.cpp +++ 
b/operator/operator/flatten.cpp @@ -40,7 +40,7 @@ bool Flatten::InferShape(const std::vector& ishape, std::vector TShape shape; std::vector dim = {in_dim[0], new_channel, 1, 1}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; return true; } @@ -49,7 +49,6 @@ void Flatten::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("axis", 1) .SetAttr("end_axis", 3) .SetDoc(R"DOC(Flatten Layer)DOC"); diff --git a/operator/operator/fully_connected.cpp b/operator/operator/fully_connected.cpp index 2a882afe1..4cb992bc5 100644 --- a/operator/operator/fully_connected.cpp +++ b/operator/operator/fully_connected.cpp @@ -35,8 +35,8 @@ bool FullyConnected::InferShape(const std::vector& ishape, std: int m = input.GetN(); int input_k = input.GetW() * input.GetH() * input.GetC(); - int n = weight.GetH(); - int k = weight.GetW(); + int n = weight.Shape(0); + int k = weight.Shape(1); if(k != input_k) return false; @@ -46,7 +46,7 @@ bool FullyConnected::InferShape(const std::vector& ishape, std: std::vector dim = {m, n, 1, 1}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; @@ -72,7 +72,6 @@ void FullyConnected::SetSchema(void) { Input({"input:float32", "weight:float32", "bias:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("num_output", 10) .SetDoc(R"DOC(Fully Connected Operator)DOC"); } diff --git a/operator/operator/gemm.cpp b/operator/operator/gemm.cpp index 8f6ae56ef..34a691198 100644 --- a/operator/operator/gemm.cpp +++ b/operator/operator/gemm.cpp @@ -51,6 +51,7 @@ bool Gemm::InferShape(const std::vector& ishape, std::vector& ishape, std::vector& oshape, int layout) +{ + // input tensors: + // 0 --- input: [seq_length, batch_size,input_size] + // 1 --- kernel [ (input_size+hidden_size),hidden_state_size] + // others: optional + + // output tensor: 
[output_len,batch_size,hidden_size] + + const TShape input_shape = ishape[0]; + + int batch_size = input_shape.Shape(1); + + std::vector dims(3); + + dims[1] = param_.output_len; + dims[0] = batch_size; + dims[2] = param_.hidden_size; + + oshape[0].SetDim(dims); + + //std::cout<& ishape, std::vector& os // others: optional // output tensor: [output_len, batch_size,hidden_size] - + // std::cout<<"!!!!!!!\n"; const TShape input_shape = ishape[0]; int batch_size = input_shape.Shape(1); @@ -22,6 +22,8 @@ bool LSTM::InferShape(const std::vector& ishape, std::vector& os dims[0] = param_.output_len; dims[1] = batch_size; dims[2] = param_.hidden_size; + + // std::cout<& ishape, std::vector& oshape, int layout) +{ + const TShape& input = ishape[0]; + // TShape& output = oshape[0]; + int n = input.GetN(); + int c = input.GetC(); + int h = input.GetH(); + int w = input.GetW(); + + std::vector o_dim=input.GetDim(); + if(param_.pad_0_h!=-1 && param_.pad_0_w!=-1 + &¶m_.pad_1_h!=-1 && param_.pad_1_w!=-1 + &¶m_.pad_2_h!=-1 && param_.pad_2_w!=-1 + &¶m_.pad_3_h!=-1 && param_.pad_3_w!=-1) + { + o_dim[0]=n+param_.pad_0_h+param_.pad_0_w; + o_dim[1]=h+param_.pad_1_h+param_.pad_1_w; + o_dim[2]=w+param_.pad_2_h+param_.pad_2_w; + o_dim[3]=c+param_.pad_3_h+param_.pad_3_w; + } + else + { + return false; + } + TShape shape; + shape.SetDim(o_dim); + shape.SetDataLayout(TENGINE_LAYOUT_NHWC); + oshape[0] = shape; + return true; +} + +void Pad::SetSchema(void) +{ + Input({"input:float32"}) + .Output({"output:float32"}) + .SetAttr("mode", 0) + .SetAttr("pad_0_h", -1) + .SetAttr("pad_0_w", -1) + .SetAttr("pad_1_h", -1) + .SetAttr("pad_1_w", -1) + .SetAttr("pad_2_h", -1) + .SetAttr("pad_2_w", -1) + .SetAttr("pad_3_h", -1) + .SetAttr("pad_3_w", -1) + .SetAttr("value", 0) + .SetDoc(R"DOC(Pad Layer)DOC"); +} + +} // namespace TEngine diff --git a/operator/operator/permute.cpp b/operator/operator/permute.cpp index 23db8bf64..30f67d07e 100644 --- a/operator/operator/permute.cpp +++ 
b/operator/operator/permute.cpp @@ -28,18 +28,31 @@ namespace TEngine { bool Permute::InferShape(const std::vector& ishape, std::vector& oshape, int layout) { const TShape& input = ishape[0]; - int n = input.GetN(); - int c = input.GetC(); - int h = input.GetH(); - int w = input.GetW(); + const std::vector dims = input.GetDim(); // only support for 0231[bhwc] if((param_.order0 == 0) && (param_.order1 == 2) && (param_.order2 == 3) && (param_.order3 == 1)) { + int n = input.GetN(); + int c = input.GetC(); + int h = input.GetH(); + int w = input.GetW(); TShape shape; std::vector dim = {n, h, w, c}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(TENGINE_LAYOUT_NHWC); + oshape[0] = shape; + return true; + } + else if((param_.order0 == 1) && (param_.order1 == 0) && (param_.order2 == 2) && dims.size()==3) + { + // int n = input.GetN(); + int c = input.Shape(0); + int h = input.Shape(1); + int w = input.Shape(2); + TShape shape; + std::vector dim = {h,c,w}; + shape.SetDim(dim); oshape[0] = shape; return true; } @@ -53,7 +66,6 @@ void Permute::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("flag", 0) .SetAttr("order0", 0) .SetAttr("order1", 1) diff --git a/operator/operator/pooling.cpp b/operator/operator/pooling.cpp index e0e02dc7d..78ebcb89d 100644 --- a/operator/operator/pooling.cpp +++ b/operator/operator/pooling.cpp @@ -85,34 +85,25 @@ bool Pooling::InferShape(const std::vector& ishape, std::vector if(param_.global) { - param_.pad_h = 0; - param_.pad_w = 0; param_.stride_h = 1; param_.stride_w = 1; - - param_.kernel_shape[0] = input_h; - param_.kernel_shape[1] = input_w; - param_.pads[0] = param_.pads[1] = param_.pads[2] = param_.pads[3] = 0; - param_.strides[0] = param_.strides[1] = 1; - + param_.kernel_h = input_h; + param_.kernel_w = input_w; + param_.pad_h0 = param_.pad_w0 = param_.pad_h1 = param_.pad_w1 = 0; output_h = 1; output_w = 1; } else { - param_.kernel_shape[0] = 
param_.kernel_h; - param_.kernel_shape[1] = param_.kernel_w; - param_.strides[0] = param_.stride_h; - param_.strides[1] = param_.stride_w; output_h = - calc_output_size(input_h, param_.kernel_shape[0], param_.stride_h, param_.pad_h, param_.caffe_flavor); + calc_output_size(input_h, param_.kernel_h, param_.stride_h, param_.pad_h0, param_.caffe_flavor); output_w = - calc_output_size(input_w, param_.kernel_shape[1], param_.stride_w, param_.pad_w, param_.caffe_flavor); + calc_output_size(input_w, param_.kernel_w, param_.stride_w, param_.pad_w0, param_.caffe_flavor); - calc_real_pads(output_h, input_h, param_.kernel_shape[0], param_.stride_h, param_.pad_h, ¶m_.pads[0], - ¶m_.pads[2]); - calc_real_pads(output_w, input_w, param_.kernel_shape[1], param_.stride_w, param_.pad_w, ¶m_.pads[1], - ¶m_.pads[3]); + calc_real_pads(output_h, input_h, param_.kernel_h, param_.stride_h, param_.pad_h0, ¶m_.pad_h0, + ¶m_.pad_h1); + calc_real_pads(output_w, input_w, param_.kernel_w, param_.stride_w, param_.pad_w0, ¶m_.pad_w0, + ¶m_.pad_w1); } TShape shape; @@ -121,14 +112,14 @@ bool Pooling::InferShape(const std::vector& ishape, std::vector std::vector dim = {input_shape.GetN(), input_shape.GetC(), output_h, output_w}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(TENGINE_LAYOUT_NCHW); } else { std::vector dim = {input_shape.GetN(), output_h, output_w, input_shape.GetC()}; shape.SetDim(dim); - shape.SetDataLayout("NHWC"); + shape.SetDataLayout(TENGINE_LAYOUT_NHWC); } oshape[0] = shape; return true; @@ -136,7 +127,7 @@ bool Pooling::InferShape(const std::vector& ishape, std::vector float Pooling::GetFops(const std::vector& inputs, const std::vector& outputs) { - float patch_fops = param_.kernel_shape[0] * param_.kernel_shape[1]; + float patch_fops = param_.kernel_h * param_.kernel_w; return (patch_fops * outputs[0].GetSize()); } @@ -145,16 +136,17 @@ void Pooling::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") 
.SetAttr("alg", 0) .SetAttr("kernel_h", 2) .SetAttr("kernel_w", 2) .SetAttr("stride_h", 1) .SetAttr("stride_w", 1) - .SetAttr("pad_h", 0) - .SetAttr("pad_w", 0) .SetAttr("global", 0) .SetAttr("caffe_flavor", 0) + .SetAttr("pad_h0", 0) + .SetAttr("pad_w0", 0) + .SetAttr("pad_h1", 0) + .SetAttr("pad_w1", 0) .SetDoc(R"DOC(Pooling Layer)DOC"); } diff --git a/operator/operator/prelu.cpp b/operator/operator/prelu.cpp index 97bec951f..37f5ccb02 100644 --- a/operator/operator/prelu.cpp +++ b/operator/operator/prelu.cpp @@ -34,7 +34,6 @@ void PReLU::SetSchema(void) { Input({"input:float32", "slope:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") // to check .SetDoc(R"DOC(PreLu Operator)DOC"); } diff --git a/operator/operator/priorbox.cpp b/operator/operator/priorbox.cpp index 1d5d9d287..59363ec21 100644 --- a/operator/operator/priorbox.cpp +++ b/operator/operator/priorbox.cpp @@ -54,7 +54,7 @@ bool PriorBox::InferShape(const std::vector& ishape, std::vecto TShape shape; std::vector dim = {feat_dim[0], 2, param_.out_dim_, 1}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; return true; } @@ -63,7 +63,6 @@ void PriorBox::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("offset", 0.5) .SetDoc(R"DOC(PriorBox Layer)DOC"); diff --git a/operator/operator/reduction.cpp b/operator/operator/reduction.cpp new file mode 100644 index 000000000..f8001a9b0 --- /dev/null +++ b/operator/operator/reduction.cpp @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#include "operator/reduction.hpp" + +namespace TEngine { + +bool Reduction::InferShape(const std::vector& ishape, std::vector& oshape, int layout) +{ + const TShape& input = ishape[0]; + + const std::vector& in_dim = input.GetDim(); + int in_size=in_dim.size(); + std::vector new_shape; + if(param_.dim_0 != -2) + new_shape.push_back(param_.dim_0); + if(param_.dim_1 != -2) + new_shape.push_back(param_.dim_1); + if(param_.dim_2 != -2) + new_shape.push_back(param_.dim_2); + if(param_.dim_3 != -2) + new_shape.push_back(param_.dim_3); + bool should_reduced[4] = {false}; + int reduceddim=0; + int kd=param_.keepdim; + int newshape_size = new_shape.size(); + std::vector real_shape={0,2,3,1}; + if(newshape_size) + { + for(int i=0;i=0) + { + int idx=new_shape[i]; + if(input.GetDataLayout()==TENGINE_LAYOUT_NCHW) + idx=real_shape[idx]; + if(idx>=0 && idx<4) + { + + should_reduced[idx]=true; + ++reduceddim; + } + } + else if(new_shape[i]<0) + { + int current=in_dim.size()+new_shape[i]; + if(input.GetDataLayout()==TENGINE_LAYOUT_NCHW) + { + current=real_shape[current]; + } + + should_reduced[current]=true; + ++reduceddim; + + } + } + } + else + { + for(int idx=0;idx odim={1}; + TShape shape; + shape.SetDim(odim); + + shape.SetDataLayout(input.GetDataLayout()); + oshape[0] = shape; + return true; + } + else + { + std::vector odim(in_size); + for(int i_idx=0,o_idx=0;i_idx odim(o_size); + for(int i_idx=0,o_idx=0;i_idx& inputs, const std::vector& ishape, std::vector dim = {n, c * 
(stride * stride), h / stride, w / stride}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; return true; @@ -47,7 +47,6 @@ void Reorg::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("stride", 1) .SetDoc(R"DOC(Reorg Operator)DOC"); } diff --git a/operator/operator/reshape.cpp b/operator/operator/reshape.cpp index 70c99d7f9..362a3eac9 100644 --- a/operator/operator/reshape.cpp +++ b/operator/operator/reshape.cpp @@ -60,24 +60,8 @@ bool Reshape::InferShape(const std::vector& ishape, std::vector TShape shape; shape.SetDim(new_shape); - // only support 2-D 3-D or 4-D - if(new_shape.size() == 4) - { - if(layout == TENGINE_LAYOUT_NCHW) - shape.SetDataLayout("NCHW"); - else - shape.SetDataLayout("NHWC"); - } - else if(new_shape.size() == 3) - { - shape.SetDataLayout("NHW"); - } - else if(new_shape.size() == 2) - { - shape.SetDataLayout("HW"); - } - else - return false; + shape.SetDataLayout(input.GetDataLayout()); + oshape[0] = shape; return true; } @@ -86,7 +70,6 @@ void Reshape::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("dim_0", -2) .SetAttr("dim_1", -2) .SetAttr("dim_2", -2) diff --git a/operator/operator/resize.cpp b/operator/operator/resize.cpp index 336650e68..9966c90bb 100644 --- a/operator/operator/resize.cpp +++ b/operator/operator/resize.cpp @@ -37,7 +37,7 @@ bool Resize::InferShape(const std::vector& ishape, std::vector< std::vector dim = {in_dim[0], in_dim[1], out_h, out_w}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; @@ -48,7 +48,6 @@ void Resize::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("scale_h", 1.f) .SetAttr("scale_w", 1.f) diff --git a/operator/operator/rnn.cpp b/operator/operator/rnn.cpp new file mode 100644 index 000000000..17934062a --- /dev/null 
+++ b/operator/operator/rnn.cpp @@ -0,0 +1,51 @@ +#include "operator/rnn.hpp" +#include "operator/rnn_param.hpp" +#include "static_graph.hpp" + +namespace TEngine { + +bool RNN::InferShape(const std::vector& ishape, std::vector& oshape, int layout) +{ + // input tensors: + // 0 --- input: [seq_length, batch_size,input_size] + // 1 --- kernel [ (input_size+hidden_size),hidden_state_size] + // others: optional + + // output tensor: [output_len,batch_size,hidden_size] + + const TShape input_shape = ishape[0]; + + int batch_size = input_shape.Shape(1); + + std::vector dims(3); + + dims[0] = param_.output_len; + dims[1] = batch_size; + dims[2] = param_.hidden_size; + + oshape[0].SetDim(dims); + + return true; +} + +void RNN::SetSchema(void) +{ + Input({"input:float32", "kernel:float32", "bias:float32", "init_h:float32"}) + .Output({"output:float32"}) + .SetAttr("clip", 0.0f) + .SetAttr("output_len", 1) + .SetAttr("sequence_len", 1) + .SetAttr("input_size", 1) + .SetAttr("hidden_size", 1) + .SetAttr("has_clip", 0) + .SetAttr("has_bias", 0) + .SetAttr("has_init_state", 0) + .SetAttr("activation", RNN_ACT_TANH) + .SetDoc(R"DOC(LSTM Cell + input: input sequences, a 3D tensor [seq_length,batch_size,input_size] + kernel: gate weight tensor,[num_directions, hidden_size, ] + bias: gate bias tensor, [num_directions, hidden_size] + init_h: optional [hidden_size] + )DOC"); +} +} // namespace TEngine diff --git a/operator/operator/roi_pooling.cpp b/operator/operator/roi_pooling.cpp index a1614fb0a..bff14634d 100644 --- a/operator/operator/roi_pooling.cpp +++ b/operator/operator/roi_pooling.cpp @@ -37,7 +37,7 @@ bool ROIPooling::InferShape(const std::vector& ishape, std::vec std::vector dim = {300, c, param_.pooled_h, param_.pooled_w}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; @@ -48,7 +48,6 @@ void ROIPooling::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") 
.SetAttr("spatial_scale", 1.f) .SetDoc(R"DOC(ROIPooling Layer)DOC"); diff --git a/operator/operator/rpn.cpp b/operator/operator/rpn.cpp index cf900e460..3175067c7 100644 --- a/operator/operator/rpn.cpp +++ b/operator/operator/rpn.cpp @@ -100,7 +100,7 @@ bool RPN::InferShape(const std::vector& ishape, std::vector dim = {feat_dim[0], param_.post_nms_topn + 1, 4, 1}; shape.SetDim(dim); - shape.SetDataLayout("NCHW"); + shape.SetDataLayout(input.GetDataLayout()); oshape[0] = shape; return true; } @@ -109,7 +109,6 @@ void RPN::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("feat_stride", 16) .SetDoc(R"DOC(RPN Layer)DOC"); diff --git a/operator/operator/sigmoid.cpp b/operator/operator/sigmoid.cpp new file mode 100644 index 000000000..ff7dabae9 --- /dev/null +++ b/operator/operator/sigmoid.cpp @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: haoluo@openailab.com + */ +#include "operator/sigmoid.hpp" + +namespace TEngine { + +float Sigmoid::GetFops(const std::vector& inputs, const std::vector& outputs) +{ + return inputs[0].GetSize(); +} + +void Sigmoid::SetSchema(void) +{ + Input({"input:float32"}) + .Output({"output:float32"}) + .SetDoc(R"DOC(ReLu Operator)DOC"); +} + +} // namespace TEngine diff --git a/operator/operator/slice.cpp b/operator/operator/slice.cpp index 8d701f946..3b1a0e52c 100644 --- a/operator/operator/slice.cpp +++ b/operator/operator/slice.cpp @@ -26,35 +26,66 @@ namespace TEngine { bool Slice::InferShape(const std::vector& ishape, std::vector& oshape, int layout) { - // only support for slice_axis=1 const TShape& input = ishape[0]; + std::vector input_dim = input.GetDim(); - int n = input.GetN(); - int c = input.GetC(); - int h = input.GetH(); - int w = input.GetW(); - - if(c % 2 != 0) - return false; - - TShape shape; - - std::vector dim = {n, c / 2, h, w}; - - shape.SetDim(dim); - shape.SetDataLayout("NCHW"); - - oshape[0] = shape; - oshape[1] = shape; - + if(param_.iscaffe) + { + int slice_axis = param_.axis; + if(param_.slice_point_.size()!= 0) + { + int prev = 0; + int input_slice_num = input_dim[slice_axis]; + unsigned int i = 0 ; + for (; i < param_.slice_point_.size(); ++i) + { + input_dim[slice_axis] = (param_.slice_point_[i] - prev); + prev = param_.slice_point_[i]; + oshape[i].SetDim(input_dim); + oshape[i].SetDataLayout(input.GetDataLayout()); + } + //The last one + input_dim[slice_axis] = (input_slice_num - prev); + oshape[i].SetDim(input_dim); + oshape[i].SetDataLayout(input.GetDataLayout()); + } + else + { + int out_num = oshape.size(); + if(input.Shape(slice_axis) % out_num != 0) + return false; + if(slice_axis > (int)input_dim.size()) + return false; + input_dim[slice_axis] = input_dim[slice_axis] / out_num; + for(int i = 0; i < out_num; i++) + { + oshape[i].SetDim(input_dim); + 
oshape[i].SetDataLayout(input.GetDataLayout()); + } + } + } + else + { + std::vector out_dim; + //input shape size must be equal to begin and size's size; + if( (param_.size_.size()!= param_.begin_.size())|| (param_.size_.size()!= input_dim.size())) + return false; + out_dim.reserve(input_dim.size()); + for(unsigned int i = 0; i < input_dim.size(); i++) + { + out_dim[i] = param_.size_[i]; + } + oshape[0].SetDim(out_dim); + oshape[0].SetDataLayout(input.GetDataLayout()); + } return true; } void Slice::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("axis", 1) + .SetAttr("iscaffe", true) .SetDoc(R"DOC(Slice Operator)DOC"); } diff --git a/operator/operator/softmax.cpp b/operator/operator/softmax.cpp index 669983bab..b12146c1b 100644 --- a/operator/operator/softmax.cpp +++ b/operator/operator/softmax.cpp @@ -29,7 +29,6 @@ void Softmax::SetSchema(void) { Input({"input:float32"}) .Output({"output:float32"}) - .SetLayout("NCHW") .SetAttr("axis", 1) .SetDoc(R"DOC(Softmax Operator)DOC"); } diff --git a/operator/operator/split.cpp b/operator/operator/split.cpp index c10d420be..a85002c36 100644 --- a/operator/operator/split.cpp +++ b/operator/operator/split.cpp @@ -28,14 +28,65 @@ namespace TEngine { bool Split::InferShape(const std::vector& ishape, std::vector& oshape, int layout) { - for(unsigned int i = 0; i < oshape.size(); i++) - oshape[i] = ishape[0]; + int axis = param_.axis; + const TShape shape = ishape[0]; + std::vector input_dim = shape.GetDim(); + + if(param_.is_caffe) + { + for(unsigned int i = 0; i < oshape.size(); i++) + oshape[i] = ishape[0]; + } + else + { + if(param_.split_sizes_.size()!= 0) + { + int sumcheck = 0; + int input_slice_num = input_dim[axis]; + for (unsigned int i = 0; i < param_.split_sizes_.size(); ++i) + { + sumcheck+=param_.split_sizes_[i]; + } + if(sumcheck!=input_slice_num) + { + return false; + } + for (unsigned int i = 0; i < param_.split_sizes_.size(); ++i) + { + input_dim[axis] = 
(param_.split_sizes_[i]); + oshape[i].SetDim(input_dim); + oshape[i].SetDataLayout(shape.GetDataLayout()); + } + } + else + { + int split_dim = param_.split_dim; + int split_shape = 0; + std::vector dim; + dim = ishape[0].GetDim(); + if(dim[axis]% split_dim!=0) + return false; + split_shape= dim[axis]/split_dim; + input_dim[axis]=split_shape; + for(unsigned int i = 0; i < oshape.size(); i++) + { + oshape[i].SetDim(input_dim); + oshape[i].SetDataLayout(shape.GetDataLayout()); + } + } + + } + return true; } - void Split::SetSchema(void) { - Input({"input:float32"}).Output({"output:float32"}).SetLayout("NCHW").SetDoc(R"DOC(Split Operator)DOC"); + Input({"input:float32"}) + .Output({"output:float32"}) + .SetAttr("axis", 0) + .SetAttr("split_dim", 1) + .SetAttr("is_caffe", false) + .SetDoc(R"DOC(Split Operator)DOC"); } } // namespace TEngine diff --git a/operator/operator/squeeze.cpp b/operator/operator/squeeze.cpp new file mode 100644 index 000000000..0e357fb85 --- /dev/null +++ b/operator/operator/squeeze.cpp @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: zpluo@openailab.com + */ +#include "operator/squeeze.hpp" + +namespace TEngine { + +bool Squeeze::InferShape(const std::vector& ishape, std::vector& oshape, int layout) +{ + const TShape& input = ishape[0]; + + const std::vector& in_dim = input.GetDim(); + int in_size=in_dim.size(); + std::vector new_shape; + if(param_.dim_0 != -2) + new_shape.push_back(param_.dim_0); + if(param_.dim_1 != -2) + new_shape.push_back(param_.dim_1); + if(param_.dim_2 != -2) + new_shape.push_back(param_.dim_2); + if(param_.dim_3 != -2) + new_shape.push_back(param_.dim_3); + bool should_squeeze[4] = {false}; + int squeezeddim=0; + int newshape_size = new_shape.size(); + std::vector real_shape={0,2,3,1}; + if(newshape_size) + { + for(int i=0;i=0) + { + int idx=new_shape[i]; + if(input.GetDataLayout()==TENGINE_LAYOUT_NCHW) + idx=real_shape[idx]; + if(in_dim[idx]==1 && idx>=0 && idx<4) + { + + should_squeeze[idx]=true; + ++squeezeddim; + } + } + else if(new_shape[i]<0) + { + int idx=new_shape[i]; + if(input.GetDataLayout()==TENGINE_LAYOUT_NCHW) + idx=real_shape[idx]; + if(in_dim[idx]==1 && idx>0 && idx<3) + { + int current=in_dim.size()+idx; + should_squeeze[current]=true; + ++squeezeddim; + } + } + } + } + else + { + for(int idx=0;idx odim(in_size-squeezeddim); + int o_idx=0; + for(int i_idx=0;i_idx& ishape, std::vector& oshape, int layout) +{ + if(param_.dim_0 == param_.dim_1 ) + { + return false; + } + if(ishape.size()!=1 || oshape.size()!=1) + return false; + + const std::vector& in_dim = ishape[0].GetDim(); + int in_dim_size = in_dim.size(); + + if( param_.dim_0 >= in_dim_size || param_.dim_1 >= in_dim_size) + return false; + + std::vector new_dim; + new_dim.resize(in_dim_size); + for(int i=0;i("LSTM"); RegisterOp("Logistic"); RegisterOp("DetectionPostProcess"); + RegisterOp("RNN"); + RegisterOp("Tanh"); + RegisterOp("Sigmoid"); + RegisterOp("Squeeze"); + RegisterOp("Pad"); + RegisterOp("Reduction"); + RegisterOp("SwapAxis"); + 
RegisterOp("GRU"); + RegisterOp("Addn"); // std::cout<<"OPERATOR PLUGIN INITED\n"; return 0; diff --git a/scripts/makefile.build b/scripts/makefile.build index 52a546b46..ff2704a16 100644 --- a/scripts/makefile.build +++ b/scripts/makefile.build @@ -19,7 +19,7 @@ bin-obj-y:= obj-y:= subdir-y:= --include $(MAKEFILE_CONFIG) +include $(MAKEFILE_CONFIG) include Makefile @@ -48,9 +48,7 @@ endif prebuilt_objs=$(prebuilt-obj-y) -#real_subdir_built_in=$(foreach f, $(subdir_objs), $(wildcard $(f)) ) -real_subdir_built_in=$(subdir_objs) -real_built_in_objs=$(real_subdir_built_in) $(curdir_objs) $(prebuilt_objs) +real_built_in_objs= $(subdir_objs) $(curdir_objs) $(prebuilt_objs) #add BUILD_DIR PREFIX curdir_objs:=$(addprefix $(BUILD_DIR)/, $(cur_objs)) @@ -79,10 +77,14 @@ $(subdir_objs): $(subdir-y); endif $(BUILT_IN_OBJ): $(real_built_in_objs) - @echo $(BUILT_IN_LD) -r -o $@ $(real_built_in_objs); \ - $(BUILT_IN_LD) -r -o $@ $(real_built_in_objs); + @for file in $? ; do if [ -f $$file ] ; then \ + NEED_BUILD=true; break; fi; done; \ + if [ $$NEED_BUILD ]; then \ + echo "$(BUILT_IN_LD) -r -o $@ $(wildcard $(real_built_in_objs))"; \ + $(BUILT_IN_LD) -r -o $@ $(wildcard $(real_built_in_objs)); fi; else $(BUILT_IN_OBJ): + endif clean:: $(subdir-y) diff --git a/serializer/Makefile b/serializer/Makefile index ab60e187f..6b9126e89 100644 --- a/serializer/Makefile +++ b/serializer/Makefile @@ -8,6 +8,7 @@ MODULE_DIR+= ifeq ($(CONFIG_CAFFE_SERIALIZER),y) obj-y+=caffe/ COMMON_CFLAGS+= -DCONFIG_CAFFE_SERIALIZER + PROTOBUF_NEEDED=y endif ifeq ($(CONFIG_MXNET_SERIALIZER),y) obj-y+=mxnet/ @@ -20,6 +21,7 @@ endif ifeq ($(CONFIG_TF_SERIALIZER),y) obj-y+=tensorflow/ COMMON_CFLAGS+= -DCONFIG_TF_SERIALIZER + PROTOBUF_NEEDED=y endif ifeq ($(CONFIG_TFLITE_SERIALIZER),y) obj-y+=tf_lite/ @@ -31,14 +33,19 @@ ifeq ($(CONFIG_TENGINE_SERIALIZER),y) COMMON_CFLAGS+= -DCONFIG_TENGINE_SERIALIZER endif -obj-y+=source/ -obj-y+=plugin/ +ifeq ($(PROTOBUF_NEEDED),y) + #to get the protobuf header file + 
PROTOBUF_HEADER=$(shell pkg-config --cflags protobuf) + COMMON_CFLAGS+=$(PROTOBUF_HEADER) +endif CXXFLAGS+= COMMON_CFLAGS+=$(CONFIG_OPT_CFLAGS) COMMON_CFLAGS+= -Wall -g -I$(shell pwd)/include -fPIC $(INC_DIR) -Werror +obj-y+=source/ +obj-y+=plugin/ install: diff --git a/serializer/caffe/caffe_serializer.cpp b/serializer/caffe/caffe_serializer.cpp index 06fe33cda..7b8333456 100644 --- a/serializer/caffe/caffe_serializer.cpp +++ b/serializer/caffe/caffe_serializer.cpp @@ -32,8 +32,10 @@ #include #include +#include "tengine_c_api.h" #include "data_type.hpp" #include "type_name.hpp" +#include "exec_attr.hpp" #include "tengine_errno.hpp" #include "caffe_serializer.hpp" #include "operator_manager.hpp" @@ -60,6 +62,8 @@ #include "operator/region_param.hpp" #include "operator/deconv_param.hpp" #include "operator/resize_param.hpp" +#include "operator/split_param.hpp" + namespace TEngine { @@ -207,6 +211,10 @@ bool CaffeSingle::LoadModel(const std::vector& file_list, StaticGra SetGraphSource(graph, file_list[0]); SetGraphSourceFormat(graph, "caffe"); SetGraphConstTensorFile(graph, file_list[0]); + SetGraphLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelFormat(graph,MODEL_FORMAT_CAFFE); + return LoadGraph(caffe_net, graph); } @@ -237,7 +245,6 @@ bool CaffeSingle::LoadNode(StaticGraph* graph, StaticNode* node, const te_caffe: StaticTensor* tensor = CreateStaticTensor(graph, tensor_name); - SetTensorDataLayout(tensor, "NCHW"); SetTensorDataType(tensor, DataType::GetTypeID("float32")); AddNodeOutputTensor(node, tensor); @@ -308,6 +315,10 @@ bool CaffeBuddy::LoadModel(const std::vector& file_list, StaticGrap SetGraphSource(graph, file_list[1]); SetGraphSourceFormat(graph, "caffe"); SetGraphConstTensorFile(graph, file_list[1]); + SetGraphLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelFormat(graph,MODEL_FORMAT_CAFFE); + return LoadGraph(test_net, train_net, graph); } @@ -425,7 +436,6 @@ static void 
LoadCaffeBlob(StaticGraph* graph, StaticNode* node, const std::vecto SetTensorDim(tensor, dims); SetTensorDataType(tensor, DataType::GetTypeID("float32")); - SetTensorDataLayout(tensor, layout_list[i]); int mem_size = blob.data_size() * 4; @@ -459,7 +469,6 @@ static void CreatePresetNode(StaticGraph* graph, StaticNode* node, const char* n SetTensorDim(tensor, dims); SetTensorDataType(tensor, DataType::GetTypeID("float32")); - SetTensorDataLayout(tensor, layout); int elem_size = 1; @@ -685,24 +694,21 @@ static bool LoadCaffeNormalize(StaticGraph* graph, StaticNode* node, const te_ca static bool LoadCaffeSlice(StaticGraph* graph, StaticNode* node, const te_caffe::LayerParameter& layer_param) { - const te_caffe::SliceParameter& slice_param = layer_param.slice_param(); - - SliceParam param = any_cast(OpManager::GetOpDefParam("Slice")); - - if(slice_param.has_axis()) + const te_caffe::SliceParameter& slice_param = layer_param.slice_param(); + SliceParam param = any_cast(OpManager::GetOpDefParam("Slice")); + if(slice_param.has_axis()) param.axis = slice_param.axis(); - else + else param.axis = 1; - - StaticOp* op = CreateStaticOp(graph, "Slice"); - - SetOperatorParam(op, param); - - SetNodeOp(node, op); - - return true; + param.iscaffe = true; + param.slice_point_.clear(); + std::copy(slice_param.slice_point().begin(),slice_param.slice_point().end(),std::back_inserter(param.slice_point_)); + StaticOp* op = CreateStaticOp(graph, "Slice"); + SetOperatorParam(op, param); + SetNodeOp(node, op); + + return true; } - static bool LoadCaffeReLu(StaticGraph* graph, StaticNode* node, const te_caffe::LayerParameter& layer_param) { ReLuParam param = any_cast(OpManager::GetOpDefParam("ReLu")); @@ -724,10 +730,11 @@ static bool LoadCaffeReLu(StaticGraph* graph, StaticNode* node, const te_caffe:: static bool LoadCaffeSplit(StaticGraph* graph, StaticNode* node, const te_caffe::LayerParameter& layer_param) { + SplitParam param = any_cast(OpManager::GetOpDefParam("Split")); + 
param.is_caffe=true; StaticOp* op = CreateStaticOp(graph, "Split"); - + SetOperatorParam(op, param); SetNodeOp(node, op); - return true; } @@ -1050,13 +1057,17 @@ static bool LoadCaffeConvolution(StaticGraph* graph, StaticNode* node, const te_ if(conv_param.has_pad_h() && conv_param.has_pad_w()) { - param.pad_h = conv_param.pad_h(); - param.pad_w = conv_param.pad_w(); + param.pad_h0 = conv_param.pad_h(); + param.pad_h1 = conv_param.pad_h(); + param.pad_w0 = conv_param.pad_w(); + param.pad_w1 = conv_param.pad_w(); } else if(conv_param.pad_size()) { - param.pad_h = conv_param.pad(0); - param.pad_w = conv_param.pad(0); + param.pad_h0 = conv_param.pad(0); + param.pad_h1 = conv_param.pad(0); + param.pad_w0 = conv_param.pad(0); + param.pad_w1 = conv_param.pad(0); } param.output_channel = conv_param.num_output(); @@ -1092,16 +1103,54 @@ static bool LoadCaffeDeconvolution(StaticGraph* graph, StaticNode* node, const t DeconvParam param = any_cast(OpManager::GetOpDefParam("Deconvolution")); - param.kernel_size = conv_param.kernel_size(0); - param.stride = conv_param.stride(0); - param.pad = conv_param.pad(0); + if(conv_param.has_kernel_h() && conv_param.has_kernel_w()) + { + param.kernel_h = conv_param.kernel_h(); + param.kernel_w = conv_param.kernel_w(); + } + else + { + param.kernel_h = conv_param.kernel_size(0); + param.kernel_w = conv_param.kernel_size(0); + } + + if(conv_param.has_stride_h() && conv_param.has_stride_w()) + { + param.stride_h = conv_param.stride_h(); + param.stride_w = conv_param.stride_w(); + } + else if(conv_param.stride_size()) + { + param.stride_h = conv_param.stride(0); + param.stride_w = conv_param.stride(0); + } + + if(conv_param.has_pad_h() && conv_param.has_pad_w()) + { + param.pad_h0 = conv_param.pad_h(); + param.pad_h1 = conv_param.pad_h(); + param.pad_w0 = conv_param.pad_w(); + param.pad_w1 = conv_param.pad_w(); + } + else if(conv_param.pad_size()) + { + param.pad_h0 = conv_param.pad(0); + param.pad_w0 = conv_param.pad(0); + param.pad_h1 = 
conv_param.pad(0); + param.pad_w1 = conv_param.pad(0); + } param.num_output = conv_param.num_output(); + if(conv_param.has_group()) + param.group = conv_param.group(); + if(conv_param.dilation_size()) { - param.dilation = conv_param.dilation(0); + param.dilation_h = conv_param.dilation(0); + param.dilation_w = conv_param.dilation(0); } + StaticOp* op = CreateStaticOp(graph, "Deconvolution"); SetOperatorParam(op, param); @@ -1147,27 +1196,23 @@ static bool LoadCaffePooling(StaticGraph* graph, StaticNode* node, const te_caff param.kernel_h = pool_param.kernel_h(); param.kernel_w = pool_param.kernel_w(); } - param.kernel_shape.resize(2); - param.kernel_shape[0] = param.kernel_h; - param.kernel_shape[1] = param.kernel_w; param.global = pool_param.global_pooling(); if(pool_param.has_pad()) { - param.pad_h = pool_param.pad(); - param.pad_w = pool_param.pad(); + param.pad_h0 = pool_param.pad(); + param.pad_h1 = pool_param.pad(); + param.pad_w0 = pool_param.pad(); + param.pad_w1 = pool_param.pad(); } else if(pool_param.has_pad_h() && pool_param.has_pad_w()) { - param.pad_h = pool_param.pad_h(); - param.pad_w = pool_param.pad_w(); + param.pad_h0 = pool_param.pad_h(); + param.pad_h1 = pool_param.pad_h(); + param.pad_w0 = pool_param.pad_w(); + param.pad_w1 = pool_param.pad_w(); } - param.pads.resize(4); - param.pads[0] = param.pad_h; - param.pads[1] = param.pad_w; - param.pads[2] = param.pad_h; - param.pads[3] = param.pad_w; if(pool_param.has_stride()) { @@ -1179,9 +1224,6 @@ static bool LoadCaffePooling(StaticGraph* graph, StaticNode* node, const te_caff param.stride_h = pool_param.stride_h(); param.stride_w = pool_param.stride_w(); } - param.strides.resize(2); - param.strides[0] = param.stride_h; - param.strides[1] = param.stride_w; param.caffe_flavor = 1; diff --git a/serializer/include/tm_format.h b/serializer/include/tengine/v1/tm1_format.h similarity index 98% rename from serializer/include/tm_format.h rename to serializer/include/tengine/v1/tm1_format.h index 
f28ae033e..890b37e13 100644 --- a/serializer/include/tm_format.h +++ b/serializer/include/tengine/v1/tm1_format.h @@ -21,11 +21,11 @@ * Copyright (c) 2018, Open AI Lab * Author: jingyou@openailab.com */ -#ifndef __TM_FORMAT_H__ -#define __TM_FORMAT_H__ +#ifndef __TM1_FORMAT_H__ +#define __TM1_FORMAT_H__ + +#include "tm_generate.h" -#include -#include #include #ifdef __cplusplus @@ -38,8 +38,6 @@ extern "C" { #define NOT_SET 0x00 -#define TM_FILE_MAX_SIZE 1 << 30 /* 1G */ - /* Type define */ typedef uint32_t tm_uoffset_t; /* offset is 4-byte unsigned integer */ typedef uint32_t tm_size_t; /* size is 4-byte unsigned integer */ diff --git a/serializer/include/tm_op_serializer.hpp b/serializer/include/tengine/v1/tm1_op_serializer.hpp similarity index 97% rename from serializer/include/tm_op_serializer.hpp rename to serializer/include/tengine/v1/tm1_op_serializer.hpp index 6e98add41..930202153 100644 --- a/serializer/include/tm_op_serializer.hpp +++ b/serializer/include/tengine/v1/tm1_op_serializer.hpp @@ -21,8 +21,8 @@ * Copyright (c) 2018, Open AI Lab * Author: jingyou@openailab.com */ -#ifndef __TM_OP_SERIALIZER_HPP__ -#define __TM_OP_SERIALIZER_HPP__ +#ifndef __TM1_OP_SERIALIZER_HPP__ +#define __TM1_OP_SERIALIZER_HPP__ #include #include "static_graph_interface.hpp" @@ -73,10 +73,13 @@ #include "operator/slice_param.hpp" #include "operator/softmax_param.hpp" #include "logger.hpp" -#include "tm_generate.h" + +#include "tm1_format.h" namespace TEngine { +namespace TMSerializer1 { + using op_load_t = std::function; tm_uoffset_t SaveTmOperator(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); @@ -121,6 +124,8 @@ template const T* GetTmPtr(void* const start_ptr, tm_uoffset_t tm_o return nullptr; } +} // namespace TMSerializer1 + } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/serializer/include/tengine/v1/tm1_serializer.hpp b/serializer/include/tengine/v1/tm1_serializer.hpp new file mode 100644 index 000000000..49c309e65 --- 
/dev/null +++ b/serializer/include/tengine/v1/tm1_serializer.hpp @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: jingyou@openailab.com + */ +#ifndef __TM1_SERIALIZER_HPP__ +#define __TM1_SERIALIZER_HPP__ + +#include "serializer.hpp" +#include "static_graph_interface.hpp" +#include "logger.hpp" + +#include "tm1_format.h" +#include "tm_serializer.hpp" + +namespace TEngine { + +namespace TMSerializer1 { + +class TmSerializer1 : public TmSerializer +{ + using name_map_t = std::unordered_map; + +public: + TmSerializer1() + { + name_ = "tm_loader"; + version_ = "1.0"; + format_name_ = "tengine"; + } + + virtual ~TmSerializer1(){}; + + bool LoadModelFromMem(void* mmap_buf, StaticGraph* graph) override; + bool SaveModelIntoMem(void* start_ptr, Graph* graph, uint32_t* tm_model_size) override; + + bool LoadBinaryFile(const char* tm_fname, int& fd, void*& buf, int& size); + bool LoadNode(StaticGraph* graph, StaticNode* node, const TM_Node* tm_node, void* mmap_buf); + bool LoadTensor(StaticGraph* graph, const TM_Tensor* tm_tensor, const TM_Buffer* tm_buf, void* mmap_buf); + bool LoadGraph(StaticGraph* graph, const TM_Model* tm_model, void* mmap_buf); + + 
tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, Graph* graph); + tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, Node* node, name_map_t& tensor_name_map); + tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, Tensor* tensor, unsigned int tensor_id, + unsigned int buffer_id); + + bool IsSaveString(void); + bool IsSaveData(void); +}; + +} // namespace TMSerializer1 + +} // namespace TEngine + +#endif diff --git a/serializer/include/tengine/v2/tm2_format.h b/serializer/include/tengine/v2/tm2_format.h new file mode 100644 index 000000000..f297425f9 --- /dev/null +++ b/serializer/include/tengine/v2/tm2_format.h @@ -0,0 +1,535 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#ifndef __TM2_FORMAT_H__ +#define __TM2_FORMAT_H__ + +#include "tm_generate.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define TM2_FILE_VER_MAIN 2 +#define TM2_FILE_VER_SUB 0 +#define TM2_FILE_VER_COMPILE 0 + +#define TM2_OP_VER 1 + +#define TM2_NOT_SET 0x00 + +/* Type define */ +typedef uint32_t tm_uoffset_t; /* offset is 4-byte unsigned integer */ +typedef uint32_t tm_size_t; /* size is 4-byte unsigned integer */ +typedef uint8_t tm_bool_t; /* bool is 1-byte unsigned integer */ + +/* Operator strings */ +#define TM2_OPSTR_ACCURACY "Accuracy" +#define TM2_OPSTR_BATCHNORMALIZATION "BatchNormalization" +#define TM2_OPSTR_BILINEARRESIZE "BilinearResize" +#define TM2_OPSTR_CONCAT "Concat" +#define TM2_OPSTR_CONST "Const" +#define TM2_OPSTR_CONVOLUTION "Convolution" +#define TM2_OPSTR_DECONVOLUTION "Deconvolution" +#define TM2_OPSTR_DETECTIONOUTPUT "DetectionOutput" +#define TM2_OPSTR_DROPOUT "Dropout" +#define TM2_OPSTR_ELTWISE "Eltwise" +#define TM2_OPSTR_FLATTEN "Flatten" +#define TM2_OPSTR_FULLYCONNECTED "FullyConnected" +#define TM2_OPSTR_INPUTOP "InputOp" +#define TM2_OPSTR_LRN "LRN" +#define TM2_OPSTR_NORMALIZE "Normalize" +#define TM2_OPSTR_PERMUTE "Permute" +#define TM2_OPSTR_POOLING "Pooling" +#define TM2_OPSTR_PRELU "PReLU" +#define TM2_OPSTR_PRIORBOX "PriorBox" +#define TM2_OPSTR_REGION "Region" +#define TM2_OPSTR_RELU "ReLu" +#define TM2_OPSTR_RELU6 "ReLu6" +#define TM2_OPSTR_REORG "Reorg" +#define TM2_OPSTR_RESHAPE "Reshape" +#define TM2_OPSTR_ROIPOOLING "ROIPooling" +#define TM2_OPSTR_RPN "RPN" +#define TM2_OPSTR_SCALE "Scale" +#define TM2_OPSTR_SLICE "Slice" +#define TM2_OPSTR_SOFTMAX "Softmax" +#define TM2_OPSTR_SPLIT "Split" +#define TM2_OPSTR_DETECTIONPOSTPROCESS "DetectionPostProcess" +#define TM2_OPSTR_GEMM "Gemm" +#define TM2_OPSTR_GENERIC "Generic" +#define TM2_OPSTR_LOGISTIC "Logistic" +#define TM2_OPSTR_LSTM "LSTM" +#define TM2_OPSTR_RNN 
"RNN" +#define TM2_OPSTR_TANH "Tanh" +#define TM2_OPSTR_SIGMOID "Sigmoid" +#define TM2_OPSTR_SQUEEZE "Squeeze" +#define TM2_OPSTR_FUSEDBNSCALERELU "Fused.BNScaleReLu" + +/* Operator types */ +#define TM2_OPTYPE_ACCURACY 0 /* No Param */ +#define TM2_OPTYPE_BATCHNORMALIZATION 1 /* TM2_BatchNormParam */ +#define TM2_OPTYPE_BILINEARRESIZE 2 /* TM2_ResizeParam */ +#define TM2_OPTYPE_CONCAT 3 /* TM2_ConcatParam */ +#define TM2_OPTYPE_CONST 4 /* No Param */ +#define TM2_OPTYPE_CONVOLUTION 5 /* TM2_ConvParam */ +#define TM2_OPTYPE_DECONVOLUTION 6 /* TM2_DeconvParam */ +#define TM2_OPTYPE_DETECTIONOUTPUT 7 /* TM2_DetectionOutputParam */ +#define TM2_OPTYPE_DROPOUT 8 /* No Param */ +#define TM2_OPTYPE_ELTWISE 9 /* TM2_EltwiseParam */ +#define TM2_OPTYPE_FLATTEN 10 /* TM2_FlattenParam */ +#define TM2_OPTYPE_FULLYCONNECTED 11 /* TM2_FCParam */ +#define TM2_OPTYPE_INPUTOP 12 /* No Param */ +#define TM2_OPTYPE_LRN 13 /* TM2_LRNParam */ +#define TM2_OPTYPE_NORMALIZE 14 /* TM2_NormalizeParam */ +#define TM2_OPTYPE_PERMUTE 15 /* TM2_PermuteParam */ +#define TM2_OPTYPE_POOLING 16 /* TM2_PoolParam */ +#define TM2_OPTYPE_PRELU 17 /* No Param */ +#define TM2_OPTYPE_PRIORBOX 18 /* TM2_PriorBoxParam */ +#define TM2_OPTYPE_REGION 19 /* TM2_RegionParam */ +#define TM2_OPTYPE_RELU 20 /* TM2_ReLuParam */ +#define TM2_OPTYPE_RELU6 21 /* No Param */ +#define TM2_OPTYPE_REORG 22 /* TM2_ReorgParam */ +#define TM2_OPTYPE_RESHAPE 23 /* TM2_ReshapeParam */ +#define TM2_OPTYPE_ROIPOOLING 24 /* TM2_ROIPoolingParam */ +#define TM2_OPTYPE_RPN 25 /* TM2_RPNParam */ +#define TM2_OPTYPE_SCALE 26 /* TM2_ScaleParam */ +#define TM2_OPTYPE_SLICE 27 /* TM2_SliceParam */ +#define TM2_OPTYPE_SOFTMAX 28 /* TM2_SoftmaxParam */ +#define TM2_OPTYPE_SPLIT 29 /* No Param */ +#define TM2_OPTYPE_DETECTIONPOSTPROCESS 30 /* TM2_DetectionPostProcessParam */ +#define TM2_OPTYPE_GEMM 31 /* TM2_GemmParam */ +#define TM2_OPTYPE_GENERIC 32 /* TM2_GenericParam */ +#define TM2_OPTYPE_LOGISTIC 33 /* No Param */ +#define 
TM2_OPTYPE_LSTM 34 /* TM2_LstmParam */ +#define TM2_OPTYPE_RNN 35 /* TM2_RnnParam */ +#define TM2_OPTYPE_TANH 36 /* No Param */ +#define TM2_OPTYPE_SIGMOID 37 /* No Param */ +#define TM2_OPTYPE_SQUEEZE 38 /* TM2_SqueezeParam */ +#define TM2_OPTYPE_FUSEDBNSCALERELU 39 /* No Param */ +#define TM2_OPTYPE_NUM 40 + +/* --------------------- -------- TM objects -------------------------------- */ + +typedef struct +{ + uint16_t ver_main; /* main version of Tengine model file format */ + uint16_t ver_sub; /* sub version of Tengine model file format */ + uint16_t ver_compile; /* compile version of Tengine model file format */ + tm_uoffset_t offset_root; /* offset of root table (TM2_Model) */ +} TM2_Header; + +/* Root table of Tengine model */ +typedef struct +{ + int32_t orig_format; /* format of original model */ + int32_t sub_format; /* sub format for DLA model */ + tm_uoffset_t offset_vo_subgraphs; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_s_mname; /* offset of string */ +} TM2_Model; + +/* Only 1 subgraph is supported currently */ +typedef struct +{ + uint32_t subgraph_id; /* subgraph id */ + int32_t graph_layout; /* actual data layout */ + int32_t model_layout; /* data layout of original model */ + tm_uoffset_t offset_vi_input_indices; /* offset of TM2_Vector_indices */ + tm_uoffset_t offset_vi_output_indices; /* offset of TM2_Vector_indices */ + tm_uoffset_t offset_vo_seq_nodes; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_vo_tensors; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_vo_buffers; /* offset of TM2_Vector_offsets */ + tm_uoffset_t offset_s_sname; /* offset of string */ +} TM2_Subgraph; + +typedef struct +{ + tm_uoffset_t offset_s_attrname; /* offset of string */ + tm_uoffset_t offset_s_attrval; /* offset of string */ + int32_t attr_type; +} TM2_Attr; + +typedef struct +{ + uint32_t node_id; /* node id */ + tm_uoffset_t offset_vi_input_tensors; /* offset of TM2_Vector_indices */ + tm_uoffset_t offset_vi_output_tensors; 
/* offset of TM2_Vector_indices */ + tm_uoffset_t offset_t_operator; /* offset of table */ + tm_uoffset_t offset_s_nname; /* offset of string */ + tm_uoffset_t offset_vo_attrs; /* offset of TM2_Vector_offsets */ + tm_bool_t dynamic_shape; +} TM2_Node; + +typedef struct +{ + uint32_t op_ver; /* version of operator */ + uint32_t operator_type; /* operator type */ + tm_uoffset_t offset_t_param; /* offset of table */ +} TM2_Operator; + +typedef struct +{ + int32_t zero_point; + float scale; + int32_t width; +} TM2_QuantParam; + +typedef struct +{ + uint32_t tensor_id; + uint32_t buffer_id; + tm_uoffset_t offset_vd_dims; /* offset of TM2_Vector_dims */ + tm_uoffset_t offset_s_tname; /* offset of string */ + tm_uoffset_t offect_vo_quantparams; /* offset of TM2_Vector_offsets */ + int32_t layout; + int32_t type; + int32_t data_type; +} TM2_Tensor; + +typedef struct +{ + tm_size_t size; /* buffer size */ + tm_uoffset_t offset_data; /* offset of buffer data */ +} TM2_Buffer; + +typedef struct +{ + tm_size_t size; /* string size */ + tm_uoffset_t offset_data; /* offset of string data */ +} TM2_String; + +/* ------------------------ ------- Vectors --------------------------------- */ + +typedef struct +{ + tm_size_t v_num; /* number of vector elements */ + tm_uoffset_t offsets[0]; +} TM2_Vector_offsets; + +typedef struct +{ + tm_size_t v_num; /* number of vector elements */ + uint32_t indices[0]; +} TM2_Vector_indices; + +typedef struct +{ + tm_size_t v_num; /* number of vector elements */ + int32_t dims[0]; +} TM2_Vector_dims; + +typedef struct +{ + tm_size_t v_num; /* number of vector elements */ + float data[0]; +} TM2_Vector_floats; + +typedef struct +{ + tm_size_t v_num; /* number of vector elements */ + float data[0][4]; /* x0, y0, x1, y1 */ +} TM2_Vector_anchors; + +/* -------------------- ------- Operator params ----------------------------- */ + +typedef struct +{ + int32_t max_input_num; + int32_t max_output_num; + tm_uoffset_t offset_s_opname; /* offset of string 
*/ +} TM2_GenericParam; + +typedef struct +{ + float rescale_factor; + float eps; + int32_t caffe_flavor; +} TM2_BatchNormParam; + +typedef struct +{ + int32_t axis; +} TM2_ConcatParam; + +typedef struct +{ + int32_t kernel_h; + int32_t kernel_w; + int32_t stride_h; + int32_t stride_w; + int32_t dilation_h; + int32_t dilation_w; + int32_t input_channel; + int32_t output_channel; + int32_t group; + int32_t activation; + int32_t pad_h0; /* top padding rows */ + int32_t pad_w0; /* left padding columns */ + int32_t pad_h1; /* bottom padding rows */ + int32_t pad_w1; /* right padding columns */ +} TM2_ConvParam; + +typedef struct +{ + int32_t num_output; + int32_t kernel_h; + int32_t kernel_w; + int32_t stride_h; + int32_t stride_w; + int32_t pad_w0; + int32_t pad_h0; + int32_t pad_w1; + int32_t pad_h1; + int32_t dilation_h; + int32_t dilation_w; + int32_t group; + int32_t activation; +} TM2_DeconvParam; + +typedef struct +{ + int32_t num_classes; + int32_t keep_top_k; + int32_t nms_top_k; + float confidence_threshold; + float nms_threshold; +} TM2_DetectionOutputParam; + +typedef struct +{ + uint32_t type; + int32_t caffe_flavor; +} TM2_EltwiseParam; + +typedef struct +{ + int32_t num_output; +} TM2_FCParam; + +typedef struct +{ + int32_t axis; + int32_t end_axis; +} TM2_FlattenParam; + +typedef struct +{ + int32_t local_size; + float alpha; + float beta; + int32_t norm_region; + float k; +} TM2_LRNParam; + +typedef struct +{ + int32_t across_spatial; + int32_t channel_shared; +} TM2_NormalizeParam; + +typedef struct +{ + int32_t flag; + int32_t order0; + int32_t order1; + int32_t order2; + int32_t order3; +} TM2_PermuteParam; + +typedef struct +{ + uint32_t alg; + int32_t kernel_h; + int32_t kernel_w; + int32_t stride_h; + int32_t stride_w; + int32_t global; + int32_t caffe_flavor; + int32_t pad_h0; /* top padding rows */ + int32_t pad_w0; /* left padding columns */ + int32_t pad_h1; /* bottom padding rows */ + int32_t pad_w1; /* right padding columns */ +} 
TM2_PoolParam; + +typedef struct +{ + tm_uoffset_t offset_vf_min_size; /* offset of TM2_Vector_floats */ + tm_uoffset_t offset_vf_max_size; /* offset of TM2_Vector_floats */ + tm_uoffset_t offset_vf_variance; /* offset of TM2_Vector_floats */ + tm_uoffset_t offset_vf_aspect_ratio; /* offset of TM2_Vector_floats */ + int32_t flip; + int32_t clip; + int32_t img_size; + int32_t img_h; + int32_t img_w; + float step_w; + float step_h; + float offset; + int32_t num_priors; + int32_t out_dim; +} TM2_PriorBoxParam; + +typedef struct +{ + int32_t num_classes; + int32_t side; + int32_t num_box; + int32_t coords; + float confidence_threshold; + float nms_threshold; + tm_uoffset_t offset_vf_biases; /* offset of TM2_Vector_floats */ +} TM2_RegionParam; + +typedef struct +{ + float negative_slope; +} TM2_ReLuParam; + +typedef struct +{ + int32_t stride; +} TM2_ReorgParam; + +typedef struct +{ + int32_t dim_0; + int32_t dim_1; + int32_t dim_2; + int32_t dim_3; + int32_t dim_size; + int32_t axis; +} TM2_ReshapeParam; + +typedef struct +{ + float scale_x; + float scale_y; +} TM2_ResizeParam; + +typedef struct +{ + int32_t pooled_h; + int32_t pooled_w; + float spatial_scale; +} TM2_ROIPoolingParam; + +typedef struct +{ + tm_uoffset_t offset_vf_ratios; /* pointer to TM2_Vector_floats */ + tm_uoffset_t offset_vf_anchor_scales; /* pointer to TM2_Vector_floats */ + int32_t feat_stride; + int32_t basesize; + int32_t min_size; + int32_t per_nms_topn; + int32_t post_nms_topn; + float nms_thresh; + tm_uoffset_t offset_va_anchors; /* offset of TM2_Vector_anchors */ +} TM2_RPNParam; + +typedef struct +{ + int32_t axis; + int32_t num_axes; + int32_t bias_term; +} TM2_ScaleParam; + +typedef struct +{ + int32_t axis; + tm_uoffset_t offset_vi_slice_points; /* offset of TM2_Vector_dims */ + tm_uoffset_t offset_vi_begins; /* offset of TM2_Vector_dims */ + tm_uoffset_t offset_vi_sizes; /* offset of TM2_Vector_dims */ + int32_t iscaffe; +} TM2_SliceParam; + +typedef struct +{ + int32_t axis; +} 
TM2_SoftmaxParam; + +typedef struct +{ + int32_t max_detections; + int32_t max_classes_per_detection; + float nms_score_threshold; + float nms_iou_threshold; + int32_t num_classes; + tm_uoffset_t offset_vf_scales; /* y_scale, x_scale, h_scale, w_scale */ +} TM2_DetectionPostProcessParam; + +typedef struct +{ + float alpha; + float beta; + int32_t transA; + int32_t transB; +} TM2_GemmParam; + +typedef struct +{ + float forget_bias; + float clip; + int32_t output_len; + int32_t sequence_len; + int32_t input_size; + int32_t hidden_size; + int32_t cell_size; + int32_t has_peephole; + int32_t has_projection; + int32_t has_clip; + int32_t has_bias; + int32_t has_init_state; + int32_t forget_act; + int32_t input_act; + int32_t output_act; + int32_t cellin_act; + int32_t cellout_act; +} TM2_LstmParam; + +typedef struct +{ + float clip; + int32_t output_len; + int32_t sequence_len; + int32_t input_size; + int32_t hidden_size; + int32_t has_clip; + int32_t has_bias; + int32_t has_init_state; + int32_t activation; +} TM2_RnnParam; + +typedef struct +{ + int32_t dim_0; + int32_t dim_1; + int32_t dim_2; + int32_t dim_3; +} TM2_SqueezeParam; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/serializer/include/tengine/v2/tm2_op_serializer.hpp b/serializer/include/tengine/v2/tm2_op_serializer.hpp new file mode 100644 index 000000000..f243f08c1 --- /dev/null +++ b/serializer/include/tengine/v2/tm2_op_serializer.hpp @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#ifndef __TM2_OP_SERIALIZER_HPP__ +#define __TM2_OP_SERIALIZER_HPP__ + +#include "static_graph_interface.hpp" +#include "logger.hpp" + +#include "operator/batch_norm.hpp" +#include "operator/concat.hpp" +#include "operator/convolution.hpp" +#include "operator/deconvolution.hpp" +#include "operator/detection_output.hpp" +#include "operator/eltwise.hpp" +#include "operator/fully_connected.hpp" +#include "operator/flatten.hpp" +#include "operator/lrn.hpp" +#include "operator/normalize.hpp" +#include "operator/permute.hpp" +#include "operator/pooling.hpp" +#include "operator/priorbox.hpp" +#include "operator/region.hpp" +#include "operator/relu.hpp" +#include "operator/reorg.hpp" +#include "operator/reshape.hpp" +#include "operator/resize.hpp" +#include "operator/roi_pooling.hpp" +#include "operator/rpn.hpp" +#include "operator/scale.hpp" +#include "operator/slice.hpp" +#include "operator/softmax.hpp" +#include "operator/detection_postprocess.hpp" +#include "operator/gemm.hpp" +#include "operator/generic.hpp" +#include "operator/logistic.hpp" +#include "operator/lstm.hpp" +#include "operator/rnn.hpp" +#include "operator/tanh.hpp" +#include "operator/sigmoid.hpp" +#include "operator/squeeze.hpp" +#include "operator/fused_operator.hpp" + +#include "operator/batch_norm_param.hpp" +#include "operator/concat_param.hpp" +#include "operator/conv_param.hpp" +#include "operator/deconv_param.hpp" +#include "operator/detection_output_param.hpp" +#include 
"operator/eltwise_param.hpp" +#include "operator/fc_param.hpp" +#include "operator/flatten_param.hpp" +#include "operator/lrn_param.hpp" +#include "operator/normalize_param.hpp" +#include "operator/permute_param.hpp" +#include "operator/pool_param.hpp" +#include "operator/priorbox_param.hpp" +#include "operator/region_param.hpp" +#include "operator/relu_param.hpp" +#include "operator/reorg_param.hpp" +#include "operator/reshape_param.hpp" +#include "operator/resize_param.hpp" +#include "operator/roi_pooling_param.hpp" +#include "operator/rpn_param.hpp" +#include "operator/scale_param.hpp" +#include "operator/slice_param.hpp" +#include "operator/softmax_param.hpp" +#include "operator/detection_postprocess_param.hpp" +#include "operator/gemm_param.hpp" +#include "operator/generic_param.hpp" +#include "operator/lstm_param.hpp" +#include "operator/rnn_param.hpp" +#include "operator/squeeze_param.hpp" + +#include "tm2_format.h" + +namespace TEngine { + +namespace TMSerializer2 { + +using op_load_t = std::function; +using op_save_t = std::function; + +std::string GetOpStr(uint32_t op_type); + +op_load_t LoadTmOpFunc(uint32_t op_type); +bool LoadTmAccuracyOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmBatchNormOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmResizeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmConcatOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmConstOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmConvOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmDeconvOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmDetectionOutputOp(StaticGraph* graph, StaticNode* node, void* const 
start_ptr, const TM2_Operator* tm_op); +bool LoadTmDropoutOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmEltwiseOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmFlattenOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmFCOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmInputOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmLRNOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmNormalizeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmPermuteOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmPoolingOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmPreluOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmPriorBoxOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmRegionOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmReLuOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmRelu6Op(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmReorgOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmReshapeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmROIPoolingOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmRPNOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const 
TM2_Operator* tm_op); +bool LoadTmScaleOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmSliceOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmSoftmaxOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmSplitOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmDetectionPostProcessOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmGemmOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmGenericOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmLogisticOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmLstmOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmRnnOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmTanhOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmSigmoidOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmSqueezeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); +bool LoadTmFusedbnscalereluOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op); + +op_save_t SaveTmOpFunc(uint32_t op_type); +tm_uoffset_t SaveTmAccuracyOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmBatchNormOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmConcatOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmConstOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); 
+tm_uoffset_t SaveTmConvOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmDeconvOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmDetectionOutputOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmDropoutOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmEltwiseOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmFCOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmFlattenOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmInputOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmLRNOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmNormalizeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmPermuteOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmPoolingOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmPreluOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmRegionOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmReLuOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmRelu6Op(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmReorgOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmReshapeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmResizeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmROIPoolingOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t 
SaveTmScaleOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmSoftmaxOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmSplitOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmDetectionPostProcessOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmGemmOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmGenericOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmLogisticOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmLstmOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmRnnOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmTanhOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmSigmoidOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmSqueezeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); +tm_uoffset_t SaveTmFusedbnscalereluOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op); + +template const T* GetTmPtr(void* const start_ptr, tm_uoffset_t tm_offset) +{ + if(tm_offset != TM2_NOT_SET) + return reinterpret_cast(reinterpret_cast(start_ptr) + tm_offset); + else + return nullptr; +} + +} // namespace TMSerializer2 + +} // namespace TEngine + +#endif diff --git a/serializer/include/tengine/v2/tm2_serializer.hpp b/serializer/include/tengine/v2/tm2_serializer.hpp new file mode 100644 index 000000000..850aa6bb5 --- /dev/null +++ b/serializer/include/tengine/v2/tm2_serializer.hpp @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#ifndef __TM2_SERIALIZER_HPP__ +#define __TM2_SERIALIZER_HPP__ + +#include "serializer.hpp" +#include "static_graph_interface.hpp" +#include "logger.hpp" + +#include "tm2_format.h" +#include "tm_serializer.hpp" + +namespace TEngine { + +namespace TMSerializer2 { + +class TmSerializer2 : public TmSerializer +{ + using name_map_t = std::unordered_map; + +public: + TmSerializer2() + { + name_ = "tm2_loader"; + version_ = "2.0"; + format_name_ = "tengine"; + } + + virtual ~TmSerializer2(){}; + + bool LoadModelFromMem(void* mmap_buf, StaticGraph* graph) override; + bool SaveModelIntoMem(void* start_ptr, Graph* graph, uint32_t* tm_model_size) override; + + bool LoadBinaryFile(const char* tm_fname, int& fd, void*& buf, int& size); + bool LoadNode(StaticGraph* graph, StaticNode* node, const TM2_Node* tm_node, void* mmap_buf); + bool LoadTensor(StaticGraph* graph, const TM2_Tensor* tm_tensor, const TM2_Buffer* tm_buf, void* mmap_buf); + bool LoadGraph(StaticGraph* graph, const TM2_Model* tm_model, void* mmap_buf); + + tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, Graph* graph); + tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, Node* node, name_map_t& 
tensor_name_map); + tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, Tensor* tensor, unsigned int tensor_id, + unsigned int buffer_id); + + bool IsSaveString(void); + bool IsSaveData(void); +}; + +} // namespace TMSerializer2 + +} // namespace TEngine + +#endif diff --git a/serializer/include/tf_lite/flatbuffers/flatbuffers.h b/serializer/include/tf_lite/flatbuffers/flatbuffers.h index e34c55d8a..4154639a3 100644 --- a/serializer/include/tf_lite/flatbuffers/flatbuffers.h +++ b/serializer/include/tf_lite/flatbuffers/flatbuffers.h @@ -1902,7 +1902,7 @@ class Verifier FLATBUFFERS_FINAL_CLASS public: Verifier(const uint8_t* buf, size_t buf_len, uoffset_t _max_depth = 64, uoffset_t _max_tables = 1000000) : buf_(buf), end_(buf + buf_len), depth_(0), max_depth_(_max_depth), num_tables_(0), max_tables_(_max_tables) - // clang-format off +// clang-format off #ifdef FLATBUFFERS_TRACK_VERIFIER_BUFFER_SIZE , upper_bound_(buf) #endif diff --git a/serializer/include/tf_serializer.hpp b/serializer/include/tf_serializer.hpp index 2a9231be6..127e23cc9 100644 --- a/serializer/include/tf_serializer.hpp +++ b/serializer/include/tf_serializer.hpp @@ -106,6 +106,81 @@ struct LSTMNode : public TFNode } }; +struct RNNNode : public TFNode +{ + float clip; + + std::string direction; + + /* optional inputs */ + TFNode* kernel; + TFNode* bias; + TFNode* init_h; + + std::set rnn_graph; + + RNNNode() + { + kernel = nullptr; + bias = nullptr; + init_h = nullptr; + } + + ~RNNNode() + { + auto rnn_ir = rnn_graph.begin(); + auto rnn_end = rnn_graph.end(); + + while(rnn_ir != rnn_end) + { + delete(*rnn_ir); + rnn_ir++; + } + } +}; + +struct GRUNode : public TFNode +{ + float clip; + + std::string direction; + + /* optional inputs */ + TFNode* kernel; + TFNode* bias; + TFNode* init_h; + //gru kernel & bias + TFNode* gate_kernel; + TFNode* gate_bias; + TFNode* candidate_kernel; + TFNode* candidate_bias; + + std::set rnn_graph; + + GRUNode() + { + kernel = nullptr; + bias = 
nullptr; + init_h = nullptr; + gate_kernel= nullptr; + gate_bias= nullptr; + candidate_kernel= nullptr; + candidate_bias= nullptr; + } + + ~GRUNode() + { + auto rnn_ir = rnn_graph.begin(); + auto rnn_end = rnn_graph.end(); + + while(rnn_ir != rnn_end) + { + delete(*rnn_ir); + rnn_ir++; + } + } +}; + struct TFGraph { std::vector seq_nodes; @@ -120,7 +195,7 @@ struct TFGraph #define TF_RNN_LSTM 0 #define TF_RNN_GRU 1 #define TF_RNN_BASIC_LSTM 2 - +#define TF_RNN_BASIC_RNN 3 class TFSerializer : public Serializer { public: @@ -165,6 +240,10 @@ class TFSerializer : public Serializer void StripRNNScope(TFGraph& tf_graph, std::string& rnn_scope, int rnn_type); void ParseLSTMGraph(TFGraph& tf_graph, LSTMNode* lstm_node, std::set& rnn_graph); + + void ParseRNNGraph(TFGraph& tf_graph, RNNNode* rnn_node, std::set& rnn_graph); + + void ParseGRUGraph(TFGraph& tf_graph, GRUNode* gru_node, std::set& rnn_graph); }; } // namespace TEngine diff --git a/serializer/include/tm_generate.h b/serializer/include/tm_generate.h index df0f6e120..ab09492d3 100644 --- a/serializer/include/tm_generate.h +++ b/serializer/include/tm_generate.h @@ -24,15 +24,15 @@ #ifndef __TM_GENERATE_H__ #define __TM_GENERATE_H__ -#include "tm_format.h" +#include #ifdef __cplusplus extern "C" { #endif -tm_uoffset_t WriteTmFileAlign1(void* const start_ptr, tm_uoffset_t* cur_pos, const void* buf, const tm_size_t buf_size); -tm_uoffset_t WriteTmFileAlign4(void* const start_ptr, tm_uoffset_t* cur_pos, const void* buf, const tm_size_t buf_size); -tm_uoffset_t WriteTmObject(void* const start_ptr, tm_uoffset_t* cur_pos, const void* buf, const tm_size_t buf_size); +uint32_t WriteTmFileAlign1(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size); +uint32_t WriteTmFileAlign4(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size); +uint32_t WriteTmObject(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size); #ifdef __cplusplus } diff 
--git a/serializer/include/tm_serializer.hpp b/serializer/include/tm_serializer.hpp index 99f8b2c67..0e719a06d 100644 --- a/serializer/include/tm_serializer.hpp +++ b/serializer/include/tm_serializer.hpp @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2018, Open AI Lab + * Copyright (c) 2019, Open AI Lab * Author: jingyou@openailab.com */ #ifndef __TM_SERIALIZER_HPP__ @@ -26,23 +26,13 @@ #include "serializer.hpp" #include "static_graph_interface.hpp" -#include "logger.hpp" -#include "tm_generate.h" namespace TEngine { class TmSerializer : public Serializer { - using name_map_t = std::unordered_map; - public: - TmSerializer() - { - name_ = "tm_loader"; - version_ = "0.1"; - format_name_ = "tengine"; - } - + TmSerializer() {}; virtual ~TmSerializer(){}; unsigned int GetFileNum(void) override @@ -65,21 +55,17 @@ class TmSerializer : public Serializer return false; } - bool LoadModelFromMem(void* mmap_buf, StaticGraph* graph); + bool LoadBinaryFile(const char* tm_fname, int& fd, void*& buf, int& size); - bool IsSaveString(void); - bool IsSaveData(void); + virtual bool LoadModelFromMem(void* mmap_buf, StaticGraph* graph) { return false; } + virtual bool SaveModelIntoMem(void* start_ptr, Graph* graph, uint32_t* tm_model_size) { return false; } +}; -protected: - bool LoadBinaryFile(const char* tm_fname, int& fd, void*& buf, int& size); - bool LoadNode(StaticGraph* graph, StaticNode* node, const TM_Node* tm_node, void* mmap_buf); - bool LoadTensor(StaticGraph* graph, const TM_Tensor* tm_tensor, const TM_Buffer* tm_buf, void* mmap_buf); - bool LoadGraph(StaticGraph* graph, const TM_Model* tm_model, void* mmap_buf); +using TmSerializerPtr = std::shared_ptr; +using TmSerializerFactory = SpecificFactory; - tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, Graph* graph); - tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, Node* node, name_map_t& tensor_name_map); - tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, 
Tensor* tensor, unsigned int tensor_id, - unsigned int buffer_id); +class TmSerializerManager : public SimpleObjectManagerWithLock +{ }; } // namespace TEngine diff --git a/serializer/mxnet/mxnet_serializer.cpp b/serializer/mxnet/mxnet_serializer.cpp index 67c016898..1d06ee129 100644 --- a/serializer/mxnet/mxnet_serializer.cpp +++ b/serializer/mxnet/mxnet_serializer.cpp @@ -24,6 +24,8 @@ #include "mxnet_serializer.hpp" +#include "tengine_c_api.h" +#include "exec_attr.hpp" #include "type_name.hpp" #include "data_type.hpp" #include "tengine_errno.hpp" @@ -38,6 +40,11 @@ #include "operator/eltwise_param.hpp" #include "operator/fc_param.hpp" #include "operator/reshape_param.hpp" +#include "operator/swap_axis_param.hpp" +#include "operator/addn_param.hpp" +#include "operator/lstm_param.hpp" +#include "operator/gru_param.hpp" +#include "operator/permute_param.hpp" //#define DEBUG @@ -47,6 +54,20 @@ typedef std::string::size_type pos; typedef std::map::const_iterator const_iterator; using op_load_t = std::function; +std::vector &split(const std::string &str, char delim, std::vector &elems, bool skip_empty = true) { + std::istringstream iss(str); + for (std::string item; getline(iss, item, delim); ) + if (skip_empty && item.empty()) continue; + else elems.push_back(atoi(item.c_str())); + return elems; +} + +static void ParseAttr_n(const std::string str, std::vector& result) +{ + std::string s = str.substr(1, str.length() - 2); + split(s,',',result); +} + static void Trim(std::string& s, const char charlist[]) { // Erase the leading characters @@ -194,8 +215,10 @@ bool MxnetSerializer::LoadTextFile(const char* fname, std::vector& no node.name = unknown.str(); cnt_unknown_name++; } - if(node.op == "Flatten") + + if(node.op == "Flatten"||node.op == "SliceChannel") node.op = "Dropout"; + nodelist.push_back(node); nest--; continue; @@ -430,8 +453,17 @@ bool MxnetSerializer::LoadModel(const std::vector& file_list, Stati SetGraphSource(graph, file_list[1]); 
SetGraphSourceFormat(graph, "mxnet"); SetGraphConstTensorFile(graph, file_list[1]); + SetGraphLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelFormat(graph,MODEL_FORMAT_MXNET); - return LoadGraph(graph, nodelist, paramlist); + bool res = LoadGraph(graph, nodelist, paramlist); + for(std::size_t ii=0; ii < paramlist.size(); ++ii) + { + std::free(paramlist[ii].raw_data); + } + + return res; } bool MxnetSerializer::LoadConstTensor(StaticGraph* graph, const std::vector& paramlist) @@ -513,7 +545,6 @@ void MxnetSerializer::CreateInputNode(StaticGraph* graph, const std::vector { MxnetNode mxnet_node = nodelist.at(i); - if(mxnet_node.op == "null") + if(mxnet_node.op == "null"||mxnet_node.op == "_zeros") continue; if(!FindOpLoadMethod(mxnet_node.op)) @@ -640,6 +651,7 @@ static bool LoadMxnetSoftmax(StaticGraph* graph, StaticNode* node, const MxnetNo StaticOp* op = CreateStaticOp(graph, "Softmax"); SoftmaxParam param = any_cast(OpManager::GetOpDefParam("Softmax")); + param.axis = 1; SetOperatorParam(op, param); @@ -669,18 +681,30 @@ static void ParseAttr(const std::string str, std::vector& result) // Remove leading '(' and trailing ')' std::string s = str.substr(1, str.length() - 2); - pos comma_pos = s.find(','); - std::string s1 = s.substr(0, comma_pos); - std::string s2 = s.substr(comma_pos + 1); - s2.erase(0, s2.find_first_not_of(" ")); - - std::istringstream ist1(s1); - std::istringstream ist2(s2); - int i, j; - ist1 >> i; - ist2 >> j; - result.push_back(i); - result.push_back(j); + std::string s1,s2; + int i; + while(1) + { + pos comma_pos = s.find(','); + if(comma_pos != std::string::npos) + { + s1 = s.substr(0, comma_pos); + s2 = s.substr(comma_pos + 1); + s2.erase(0, s2.find_first_not_of(" ")); + std::istringstream ist1(s1); + ist1 >> i; + result.push_back(i); + s = s2; + + }else + { + std::istringstream ist2(s2); + ist2 >> i; + result.push_back(i); + break; + } + } + } static bool LoadMxnetConvolution(StaticGraph* graph, 
StaticNode* node, const MxnetNode& mxnet_node) @@ -708,8 +732,10 @@ static bool LoadMxnetConvolution(StaticGraph* graph, StaticNode* node, const Mxn if(cit != mxnet_node.attrs.end()) { ParseAttr(cit->second, v3); - param.pad_h = v3.at(0); - param.pad_w = v3.at(1); + param.pad_h0 = v3.at(0); + param.pad_h1 = v3.at(0); + param.pad_w0 = v3.at(1); + param.pad_w1 = v3.at(1); } cit = mxnet_node.attrs.find("num_group"); if(cit != mxnet_node.attrs.end()) @@ -730,7 +756,7 @@ static bool LoadMxnetConvolution(StaticGraph* graph, StaticNode* node, const Mxn #ifdef DEBUG std::cout << "ConvParam : " << param.kernel_h << ", " << param.kernel_w << ", " << param.stride_h << ", " - << param.stride_w << ", " << param.pad_h << ", " << param.pad_w << ", " << param.group << ", " + << param.stride_w << ", " << param.pad_h0 << ", " << param.pad_w0 << ", " << param.group << ", " << param.output_channel << std::endl; #endif @@ -754,10 +780,6 @@ static bool LoadMxnetPooling(StaticGraph* graph, StaticNode* node, const MxnetNo ParseAttr(cit->second, v1); param.kernel_h = v1.at(0); param.kernel_w = v1.at(1); - - param.kernel_shape.resize(2); - param.kernel_shape[0] = param.kernel_h; - param.kernel_shape[1] = param.kernel_w; } cit = mxnet_node.attrs.find("stride"); if(cit != mxnet_node.attrs.end()) @@ -765,43 +787,42 @@ static bool LoadMxnetPooling(StaticGraph* graph, StaticNode* node, const MxnetNo ParseAttr(cit->second, v2); param.stride_h = v2.at(0); param.stride_w = v2.at(1); - - param.strides.resize(2); - param.strides[0] = param.stride_h; - param.strides[1] = param.stride_w; } cit = mxnet_node.attrs.find("pad"); if(cit != mxnet_node.attrs.end()) { ParseAttr(cit->second, v3); - param.pad_h = v3.at(0); - param.pad_w = v3.at(1); - - param.pads.resize(4); - param.pads[0] = param.pad_h; - param.pads[1] = param.pad_w; - param.pads[2] = param.pad_h; - param.pads[3] = param.pad_w; + param.pad_h0 = v3.at(0); + param.pad_h1 = v3.at(0); + param.pad_w0 = v3.at(1); + param.pad_w1 = v3.at(1); } cit = 
mxnet_node.attrs.find("pool_type"); if(cit != mxnet_node.attrs.end()) { if(cit->second == "max") { - param.global = 0; param.alg = kPoolMax; } else if(cit->second == "avg") { - param.global = 1; param.alg = kPoolAvg; } } + param.global = 0; + cit = mxnet_node.attrs.find("global_pool"); + if(cit != mxnet_node.attrs.end()) + { + if(cit->second == "True") + { + param.global = 1; + } + } param.caffe_flavor = 0; #ifdef DEBUG std::cout << "PoolParam : " << param.kernel_h << ", " << param.kernel_w << ", " << param.stride_h << ", " - << param.stride_w << ", " << param.pad_h << ", " << param.pad_w << ", " << param.global << ", " + << param.stride_w << ", " << param.pad_h0 << ", " << param.pad_w0 << ", " << param.global << ", " << param.alg << std::endl; #endif @@ -851,14 +872,42 @@ static bool LoadMxnetDropout(StaticGraph* graph, StaticNode* node, const MxnetNo return true; } -static bool LoadMxnetRelu(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) +static bool LoadMxnetActivation(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) { - ReLuParam param = any_cast(OpManager::GetOpDefParam("ReLu")); - param.negative_slope = 0.f; + const_iterator act_type = mxnet_node.attrs.find("act_type"); + if(act_type != mxnet_node.attrs.end()) + { + if(act_type->second == "relu") + { + ReLuParam param = any_cast(OpManager::GetOpDefParam("ReLu")); + param.negative_slope = 0.f; - StaticOp* op = CreateStaticOp(graph, "ReLu"); - SetOperatorParam(op, param); - SetNodeOp(node, op); + StaticOp* op = CreateStaticOp(graph, "ReLu"); + SetOperatorParam(op, param); + SetNodeOp(node, op); + } + else if(act_type->second == "tanh") + { + StaticOp* op = CreateStaticOp(graph, "Tanh"); + SetNodeOp(node, op); + } + else if(act_type->second == "sigmoid") + { + StaticOp* op = CreateStaticOp(graph, "Sigmoid"); + SetNodeOp(node, op); + } + else if(act_type->second == "softmax") + { + SoftmaxParam param = any_cast(OpManager::GetOpDefParam("Softmax")); + param.axis = 1; + + 
StaticOp* op = CreateStaticOp(graph, "Softmax"); + SetOperatorParam(op, param); + SetNodeOp(node, op); + } + else + return false; + } return true; } @@ -877,7 +926,6 @@ static bool LoadMxnetEltScalar(StaticGraph* graph, StaticNode* node, const Mxnet dims.push_back(1); SetTensorDim(tensor, dims); SetTensorDataType(tensor, DataType::GetTypeID("float32")); - SetTensorDataLayout(tensor, "W"); SetTensorSize(tensor, sizeof(float)); float* mem_buf = ( float* )std::malloc(sizeof(float)); @@ -993,7 +1041,116 @@ static bool LoadMxnetReshape(StaticGraph* graph, StaticNode* node, const MxnetNo return true; } +static bool LoadMxnetPermute(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) +{ + PermuteParam param = any_cast(OpManager::GetOpDefParam("Permute")); + + const_iterator cit; + std::vector v1; + + cit = mxnet_node.attrs.find("axes"); + + ParseAttr_n(cit->second, v1); + + param.order0 = v1[0]; + param.order1 = v1[1]; + param.order2 = v1[2]; + param.order3 = -2; + + StaticOp* op = CreateStaticOp(graph, "Permute"); + SetOperatorParam(op, param); + SetNodeOp(node, op); + + return true; +} +static bool LoadMxnetSwapAxis(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) +{ + SwapAxisParam param = any_cast(OpManager::GetOpDefParam("SwapAxis")); + + const_iterator cit; + cit = mxnet_node.attrs.find("dim1"); + if(cit != mxnet_node.attrs.end()) + { + std::istringstream ist(cit->second); + ist >> param.dim_0; + } + cit = mxnet_node.attrs.find("dim2"); + if(cit != mxnet_node.attrs.end()) + { + std::istringstream ist(cit->second); + ist >> param.dim_1; + } + + StaticOp* op = CreateStaticOp(graph, "SwapAxis"); + SetOperatorParam(op, param); + SetNodeOp(node, op); + + return true; +} + +static bool LoadMxnetAddN(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) +{ + AddnParam param = any_cast(OpManager::GetOpDefParam("Addn")); + param.axis = 1; + + StaticOp* op = CreateStaticOp(graph, "Addn"); + SetOperatorParam(op, param); + 
SetNodeOp(node, op); + + return true; +} + +static bool LoadMxnetClip(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) +{ + const_iterator cit1, cit2; + cit1 = mxnet_node.attrs.find("a_max"); + cit2 = mxnet_node.attrs.find("a_min"); + if(cit1 != mxnet_node.attrs.end() && cit1->second == "6" && + cit2 != mxnet_node.attrs.end() && cit2->second == "0") + { + StaticOp* op = CreateStaticOp(graph, "ReLu6"); + SetNodeOp(node, op); + } + else + return false; + + return true; +} + +static bool LoadMxnetRNN(StaticGraph* graph, StaticNode* node, const MxnetNode& mxnet_node) +{ + const_iterator cit = mxnet_node.attrs.find("mode"); + const_iterator cit1 = mxnet_node.attrs.find("state_size"); + int s_size=atoi(cit1->second.c_str()); + + if(cit->second == "lstm") + { + LSTMParam param = any_cast(OpManager::GetOpDefParam("LSTM")); + param.mxnet_flag=1; + + param.hidden_size=s_size; + param.cell_size=s_size; + + StaticOp* op = CreateStaticOp(graph, "LSTM"); + SetOperatorParam(op, param); + // SetOperatorDynamicShape(op); + SetNodeOp(node, op); + + } + else if(cit->second == "gru") + { + GRUParam param = any_cast(OpManager::GetOpDefParam("GRU")); + param.mxnet_flag=1; + param.hidden_size=s_size; + + StaticOp* op = CreateStaticOp(graph, "GRU"); + SetOperatorParam(op, param); + // SetOperatorDynamicShape(op); + SetNodeOp(node, op); + } + return true; +} bool MxnetSerializerRegisterOpLoader(void) { SerializerPtr serializer; @@ -1009,7 +1166,7 @@ bool MxnetSerializerRegisterOpLoader(void) p_mxnet->RegisterOpLoadMethod("Concat", op_load_t(LoadMxnetConcat)); p_mxnet->RegisterOpLoadMethod("BatchNorm", op_load_t(LoadMxnetBatchNorm)); p_mxnet->RegisterOpLoadMethod("Dropout", op_load_t(LoadMxnetDropout)); - p_mxnet->RegisterOpLoadMethod("Activation", op_load_t(LoadMxnetRelu)); + p_mxnet->RegisterOpLoadMethod("Activation", op_load_t(LoadMxnetActivation)); p_mxnet->RegisterOpLoadMethod("_minus_scalar", op_load_t(LoadMxnetEltScalar)); 
p_mxnet->RegisterOpLoadMethod("_mul_scalar", op_load_t(LoadMxnetEltScalar)); @@ -1018,6 +1175,11 @@ bool MxnetSerializerRegisterOpLoader(void) p_mxnet->RegisterOpLoadMethod("FullyConnected", op_load_t(LoadMxnetFullyConnected)); p_mxnet->RegisterOpLoadMethod("Reshape", op_load_t(LoadMxnetReshape)); + p_mxnet->RegisterOpLoadMethod("SwapAxis", op_load_t(LoadMxnetSwapAxis)); + p_mxnet->RegisterOpLoadMethod("add_n", op_load_t(LoadMxnetAddN)); + p_mxnet->RegisterOpLoadMethod("clip", op_load_t(LoadMxnetClip)); + p_mxnet->RegisterOpLoadMethod("RNN", op_load_t(LoadMxnetRNN)); + p_mxnet->RegisterOpLoadMethod("transpose", op_load_t(LoadMxnetPermute)); return true; } diff --git a/serializer/onnx/onnx_serializer.cpp b/serializer/onnx/onnx_serializer.cpp index 73e6dda00..668e8db2e 100644 --- a/serializer/onnx/onnx_serializer.cpp +++ b/serializer/onnx/onnx_serializer.cpp @@ -27,6 +27,8 @@ #include #include +#include "tengine_c_api.h" +#include "exec_attr.hpp" #include "data_type.hpp" #include "tengine_errno.hpp" #include "operator_manager.hpp" @@ -63,6 +65,9 @@ bool OnnxSerializer::LoadModel(const std::vector& file_list, Static SetGraphSource(graph, file_list[0]); SetGraphSourceFormat(graph, "onnx"); SetGraphConstTensorFile(graph, file_list[0]); + SetGraphLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelFormat(graph,MODEL_FORMAT_ONNX); return LoadGraph(model, graph); } @@ -191,7 +196,6 @@ void OnnxSerializer::CreateInputNode(StaticGraph* graph, const onnx::GraphProto& StaticTensor* tensor = CreateStaticTensor(graph, val.name()); SetTensorDataType(tensor, DataType::GetTypeID("float32")); - SetTensorDataLayout(tensor, "NCHW"); if(has_shape) SetTensorDim(tensor, dims); @@ -239,7 +243,6 @@ bool OnnxSerializer::LoadNode(StaticGraph* graph, StaticNode* node, const onnx:: StaticTensor* tensor = CreateStaticTensor(graph, output_name); SetTensorDataType(tensor, DataType::GetTypeID("float32")); - SetTensorDataLayout(tensor, "NCHW"); 
AddNodeOutputTensor(node, tensor); } @@ -307,8 +310,10 @@ static bool LoadOnnxConvolutionOp(StaticGraph* graph, StaticNode* node, const on } else if(attr.name() == "pads") { - param.pad_h = attr.ints(0); - param.pad_w = attr.ints(1); + param.pad_h0 = attr.ints(0); + param.pad_h1 = attr.ints(0); + param.pad_w0 = attr.ints(1); + param.pad_w1 = attr.ints(1); } } @@ -323,13 +328,10 @@ static bool LoadOnnxConvolutionOp(StaticGraph* graph, StaticNode* node, const on { const std::vector& dim = GetTensorDim(tensor); - SetTensorDataLayout(tensor, "NCHW"); /* onnx hide the output channel in weight ..*/ param.output_channel = dim[0]; } - else if(k == 2) - SetTensorDataLayout(tensor, "W"); } StaticOp* op = CreateStaticOp(graph, "Convolution"); @@ -353,13 +355,6 @@ static bool LoadOnnxBN(StaticGraph* graph, StaticNode* node, const onnx::NodePro param.eps = attr.f(); } - for(int k = 1; k < onnx_node.input_size(); k++) - { - const std::string& input_name = onnx_node.input(k); - StaticTensor* tensor = FindTensor(graph, input_name); - SetTensorDataLayout(tensor, "W"); - } - StaticOp* op = CreateStaticOp(graph, "BatchNormalization"); SetOperatorParam(op, param); SetNodeOp(node, op); @@ -415,8 +410,10 @@ static bool LoadOnnxPooling(StaticGraph* graph, StaticNode* node, const onnx::No } else if(attr.name() == "pads") { - param.pad_h = attr.ints(0); - param.pad_w = attr.ints(1); + param.pad_h0 = attr.ints(0); + param.pad_h1 = attr.ints(0); + param.pad_w0 = attr.ints(1); + param.pad_w1 = attr.ints(1); } } } @@ -426,20 +423,6 @@ static bool LoadOnnxPooling(StaticGraph* graph, StaticNode* node, const onnx::No return false; } - param.kernel_shape.resize(2); - param.kernel_shape[0] = param.kernel_h; - param.kernel_shape[1] = param.kernel_w; - - param.pads.resize(4); - param.pads[0] = param.pad_h; - param.pads[1] = param.pad_w; - param.pads[2] = param.pad_h; - param.pads[3] = param.pad_w; - - param.strides.resize(2); - param.strides[0] = param.stride_h; - param.strides[1] = param.stride_w; - 
StaticOp* op = CreateStaticOp(graph, "Pooling"); SetOperatorParam(op, param); @@ -488,8 +471,6 @@ static bool LoadOnnxGemm(StaticGraph* graph, StaticNode* node, const onnx::NodeP StaticTensor* bias_tensor = FindTensor(graph, onnx_node.input(2)); - SetTensorDataLayout(weight_tensor, "HW"); - SetTensorDataLayout(bias_tensor, "W"); if(param.transA) { diff --git a/serializer/plugin/init.cpp b/serializer/plugin/init.cpp index 747c7dc56..4f78bf9fa 100644 --- a/serializer/plugin/init.cpp +++ b/serializer/plugin/init.cpp @@ -76,8 +76,9 @@ extern bool TFLiteSerializerRegisterOpLoader(); #endif #ifdef CONFIG_TENGINE_SERIALIZER -extern bool TmSerializerRegisterOpLoader(); +bool TmSerializerInit(void); #endif + } // namespace TEngine using namespace TEngine; @@ -137,12 +138,7 @@ int serializer_plugin_init(void) #endif #ifdef CONFIG_TENGINE_SERIALIZER - factory->RegisterInterface("tengine"); - auto tm_serializer = factory->Create("tengine"); - - SerializerManager::SafeAdd("tengine", SerializerPtr(tm_serializer)); - - TmSerializerRegisterOpLoader(); + TmSerializerInit(); #define SrcTmName "src_tm" diff --git a/serializer/tengine/Makefile b/serializer/tengine/Makefile index acfa649af..703441439 100644 --- a/serializer/tengine/Makefile +++ b/serializer/tengine/Makefile @@ -1,4 +1,7 @@ obj-y+=tm_generate.o -obj-y+=tm_op_load.o -obj-y+=tm_op_save.o obj-y+=tm_serializer.o +obj-y+=v1/ +obj-y+=v2/ + +COMMON_CFLAGS+=-I$(shell pwd)/../include/tengine + diff --git a/serializer/tengine/tm_generate.c b/serializer/tengine/tm_generate.c index 48be4188e..27f598b93 100644 --- a/serializer/tengine/tm_generate.c +++ b/serializer/tengine/tm_generate.c @@ -30,22 +30,22 @@ extern "C" { #define ALIGN(pos, alignbytes) (((pos) + ( alignbytes )-1) & ~(( alignbytes )-1)) -tm_uoffset_t WriteTmFileAlign1(void* const start_ptr, tm_uoffset_t* cur_pos, const void* buf, const tm_size_t buf_size) +uint32_t WriteTmFileAlign1(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size) { - 
tm_uoffset_t buf_pos = *cur_pos; + uint32_t buf_pos = *cur_pos; memcpy(start_ptr + *cur_pos, buf, buf_size); *cur_pos += buf_size; return buf_pos; } -tm_uoffset_t WriteTmFileAlign4(void* const start_ptr, tm_uoffset_t* cur_pos, const void* buf, const tm_size_t buf_size) +uint32_t WriteTmFileAlign4(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size) { *cur_pos = ALIGN(*cur_pos, 4); return WriteTmFileAlign1(start_ptr, cur_pos, buf, buf_size); } -tm_uoffset_t WriteTmObject(void* const start_ptr, tm_uoffset_t* cur_pos, const void* buf, const tm_size_t buf_size) +uint32_t WriteTmObject(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size) { return WriteTmFileAlign4(start_ptr, cur_pos, buf, buf_size); } diff --git a/serializer/tengine/tm_serializer.cpp b/serializer/tengine/tm_serializer.cpp index 047f99006..d18962674 100644 --- a/serializer/tengine/tm_serializer.cpp +++ b/serializer/tengine/tm_serializer.cpp @@ -18,274 +18,26 @@ */ /* - * Copyright (c) 2018, Open AI Lab + * Copyright (c) 2019, Open AI Lab * Author: jingyou@openailab.com */ #include #include #include #include -#include -#include "data_type.hpp" #include "operator_manager.hpp" #include "static_graph.hpp" #include "graph.hpp" -#include "node.hpp" -#include "tensor.hpp" #include "tm_serializer.hpp" -#include "tm_op_serializer.hpp" -#include "compiler.hpp" -namespace TEngine { - -bool TmSerializer::IsSaveString(void) -{ - const char* env = std::getenv("TM_WITH_STRING"); - - if(env) - return true; - else - return false; -} - -bool TmSerializer::IsSaveData(void) -{ - const char* env = std::getenv("TM_FOR_BENCHMARK"); - - if(env) - return false; - else - return true; -} - -tm_uoffset_t TmSerializer::SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, Tensor* tensor, - unsigned int tensor_id, unsigned int buffer_id) -{ - TM_Tensor tm_tensor; - tm_tensor.tensor_id = tensor_id; - tm_tensor.buffer_id = buffer_id; - tm_tensor.type = 
tensor->GetType(); - - bool tm_with_string = IsSaveString(); +#define TM_FILE_MAX_SIZE 1 << 30 /* 1G */ - if(tm_with_string) - { - std::string name = tensor->GetName(); - TM_String tensor_name; - tensor_name.size = name.size(); - tensor_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), tensor_name.size); - tm_tensor.offset_s_tname = WriteTmObject(start_ptr, cur_pos, &tensor_name, sizeof(TM_String)); - } - else - tm_tensor.offset_s_tname = NOT_SET; - - const std::string& data_type = DataType::GetTypeName(tensor->GetDataType()); - if(data_type == "float32") - tm_tensor.data_type = TM_DT_FLOAT32; - else if(data_type == "float16") - tm_tensor.data_type = TM_DT_FLOAT16; - else if(data_type == "int") - tm_tensor.data_type = TM_DT_INT32; - else if(data_type == "int8") - tm_tensor.data_type = TM_DT_INT8; - - /* Get the dims of the tensor */ - TShape& shape = tensor->GetShape(); - std::vector& dim = shape.GetDim(); - if(dim.size()) - { - /* Write the vector of dims */ - size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * dim.size(); - TM_Vector_dims* v_dims = ( TM_Vector_dims* )malloc(vector_size); - v_dims->v_num = dim.size(); - for(unsigned int i = 0; i < dim.size(); i++) - { - v_dims->dims[i] = dim[i]; - } - tm_tensor.offset_vd_dims = WriteTmObject(start_ptr, cur_pos, v_dims, vector_size); - free(v_dims); - } - else - tm_tensor.offset_vd_dims = NOT_SET; - - /* Write the tensor */ - return WriteTmObject(start_ptr, cur_pos, &tm_tensor, sizeof(TM_Tensor)); -} - -tm_uoffset_t TmSerializer::SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, Node* node, - name_map_t& tensor_name_map) -{ - TM_Node tm_node; - tm_node.node_id = node->GetNodeIndex(); - tm_node.dynamic_shape = node->IsDynamicShape(); - - bool tm_with_string = IsSaveString(); - - if(tm_with_string) - { - std::string name = node->GetName(); - TM_String node_name; - node_name.size = name.size(); - node_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), 
node_name.size); - tm_node.offset_s_nname = WriteTmObject(start_ptr, cur_pos, &node_name, sizeof(TM_String)); - } - else - tm_node.offset_s_nname = NOT_SET; - - unsigned int input_num = node->GetInputNum(); - unsigned int output_num = node->GetOutputNum(); - - if(input_num) - { - /* Write the vector of input indices */ - size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * input_num; - TM_Vector_indices* v_input_indices = ( TM_Vector_indices* )malloc(vector_size); - v_input_indices->v_num = input_num; - for(unsigned int i = 0; i < input_num; i++) - { - Tensor* p_tensor = node->GetInputTensor(i); - v_input_indices->indices[i] = tensor_name_map[p_tensor->GetName()]; - } - tm_node.offset_vi_input_tensors = WriteTmObject(start_ptr, cur_pos, v_input_indices, vector_size); - free(v_input_indices); - } - else - tm_node.offset_vi_input_tensors = NOT_SET; - - if(output_num) - { - /* Write the vector of output indices */ - size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * output_num; - TM_Vector_indices* v_output_indices = ( TM_Vector_indices* )malloc(vector_size); - v_output_indices->v_num = output_num; - for(unsigned int i = 0; i < output_num; i++) - { - Tensor* p_tensor = node->GetOutputTensor(i); - v_output_indices->indices[i] = tensor_name_map[p_tensor->GetName()]; - } - tm_node.offset_vi_output_tensors = WriteTmObject(start_ptr, cur_pos, v_output_indices, vector_size); - free(v_output_indices); - } - else - tm_node.offset_vi_output_tensors = NOT_SET; - - tm_node.offset_t_operator = SaveTmOperator(start_ptr, cur_pos, node->GetOp()); - - /* Write the node */ - return WriteTmObject(start_ptr, cur_pos, &tm_node, sizeof(TM_Node)); -} - -tm_uoffset_t TmSerializer::SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, Graph* graph) -{ - TM_Subgraph tm_subgraph; - tm_subgraph.subgraph_id = 0; /* subgraph_id starts from 0 */ - tm_subgraph.offset_s_sname = NOT_SET; - - unsigned int tensor_num = 0; - unsigned int buffer_num = 0; - std::vector tensor_ptrs; 
- std::vector buf_ptrs; - std::vector buf_sizes; - name_map_t tensor_name_map; /* map of tensor name and tensor index */ - bool tm_no_data = !IsSaveData(); - - /* Write the nodes */ - size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * graph->seq_nodes.size(); - TM_Vector_offsets* v_nodes = ( TM_Vector_offsets* )malloc(vector_size); - v_nodes->v_num = graph->seq_nodes.size(); - for(unsigned int i = 0; i < graph->seq_nodes.size(); i++) - { - Node* p_node = graph->seq_nodes[i]; - for(unsigned int k = 0; k < p_node->GetOutputNum(); k++) - { - Tensor* p_tensor = p_node->GetOutputTensor(k); - tensor_ptrs.push_back(p_tensor); - tensor_name_map[p_tensor->GetName()] = tensor_num; - tensor_num++; - } - v_nodes->offsets[i] = SaveTmNode(start_ptr, cur_pos, p_node, tensor_name_map); - } - /* Write the vector of nodes */ - tm_subgraph.offset_vo_seq_nodes = WriteTmObject(start_ptr, cur_pos, v_nodes, vector_size); - - /* Write the tensors */ - vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor_num; - TM_Vector_offsets* v_tensors = ( TM_Vector_offsets* )malloc(vector_size); - v_tensors->v_num = tensor_num; - for(unsigned int i = 0; i < tensor_num; i++) - { - Tensor* p_tensor = tensor_ptrs[i]; - if(p_tensor->GetType() == kConstTensor) - { - buf_ptrs.push_back(p_tensor->GetMemAddr()); - buf_sizes.push_back(p_tensor->GetTotalSize()); - buffer_num++; - } - - v_tensors->offsets[i] = SaveTmTensor(start_ptr, cur_pos, p_tensor, i, buffer_num - 1); - } - /* Write the vector of tensors */ - tm_subgraph.offset_vo_tensors = WriteTmObject(start_ptr, cur_pos, v_tensors, vector_size); - - /* Write the buffers */ - vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * buffer_num; - TM_Vector_offsets* v_buffers = ( TM_Vector_offsets* )malloc(vector_size); - v_buffers->v_num = buffer_num; - for(unsigned int i = 0; i < buffer_num; i++) - { - TM_Buffer tm_buf; - tm_buf.size = buf_sizes[i]; - - if(tm_no_data) - { - /* TM_FOR_BENCHMARK environment variable exists. 
Not write buf data into the tm file */ - tm_buf.offset_data = NOT_SET; - } - else - { - /* TM_FOR_BENCHMARK environment variable does not exist */ - tm_buf.offset_data = - WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast(buf_ptrs[i]), tm_buf.size); - } - v_buffers->offsets[i] = WriteTmObject(start_ptr, cur_pos, &tm_buf, sizeof(TM_Buffer)); - } - /* Write the vector of buffers */ - tm_subgraph.offset_vo_buffers = WriteTmObject(start_ptr, cur_pos, v_buffers, vector_size); - - /* Write the vector of input indices */ - vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->input_nodes.size(); - TM_Vector_indices* v_input_indices = ( TM_Vector_indices* )malloc(vector_size); - v_input_indices->v_num = graph->input_nodes.size(); - for(unsigned int i = 0; i < graph->input_nodes.size(); i++) - { - v_input_indices->indices[i] = graph->input_nodes[i]->GetNodeIndex(); - } - tm_subgraph.offset_vi_input_indices = WriteTmObject(start_ptr, cur_pos, v_input_indices, vector_size); - - /* Write the vector of output indices */ - vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->output_nodes.size(); - TM_Vector_indices* v_output_indices = ( TM_Vector_indices* )malloc(vector_size); - v_output_indices->v_num = graph->output_nodes.size(); - for(unsigned int i = 0; i < graph->output_nodes.size(); i++) - { - v_output_indices->indices[i] = graph->output_nodes[i]->GetNodeIndex(); - } - tm_subgraph.offset_vi_output_indices = WriteTmObject(start_ptr, cur_pos, v_output_indices, vector_size); - - /* Write the subgraph */ - tm_uoffset_t ret = WriteTmObject(start_ptr, cur_pos, &tm_subgraph, sizeof(TM_Subgraph)); - - /* Free the memory of vectors */ - free(v_tensors); - free(v_buffers); - free(v_nodes); - free(v_input_indices); - free(v_output_indices); +namespace TEngine { - return ret; -} +extern bool register_tm1_serializer(); +extern bool register_tm2_serializer(); bool TmSerializer::SaveModel(const std::vector& file_list, Graph* graph) { @@ -294,7 +46,7 @@ bool 
TmSerializer::SaveModel(const std::vector& file_list, Graph* g return false; /* Open the tengine model file */ - int fd = open(file_list[0].c_str(), O_RDWR | O_CREAT, 0666); + int fd = open(file_list[0].c_str(), O_RDWR | O_CREAT | O_TRUNC, 0666); if(fd == -1) { LOG_ERROR() << "Could not open " << file_list[0] << "\n"; @@ -325,267 +77,29 @@ bool TmSerializer::SaveModel(const std::vector& file_list, Graph* g bool TmSerializer::SaveModel(std::vector& addr_list, std::vector& size_list, Graph* graph) { - bool tm_with_string = IsSaveString(); + uint32_t tm_model_size = 0; - void* start_ptr = ( void* )malloc(TM_FILE_MAX_SIZE); + uint32_t malloc_size = TM_FILE_MAX_SIZE; + const char* env = std::getenv("TM_FILE_MAX_SIZE"); + if(env) + malloc_size = std::atoi(env); + + void* start_ptr = ( void* )malloc(malloc_size); if(start_ptr == nullptr) { - LOG_ERROR() << "No enough memory for saving tengine model.\n"; + LOG_ERROR() << "Malloc memory failed: " << malloc_size << ".\n"; return false; } - tm_size_t tm_model_size = 0; - tm_uoffset_t cur_pos = sizeof(TM_Header); - - /* Define the TM_Header object */ - TM_Header header; - header.ver_main = TM_FILE_VER_MAIN; - header.ver_sub = TM_FILE_VER_SUB; - header.ver_compile = TM_FILE_VER_COMPILE; - - /* Define the TM_Model object */ - TM_Model tm_model; - if(tm_with_string) - { - const std::string& fname = graph->GetName(); - TM_String model_name; - model_name.size = fname.size(); - model_name.offset_data = WriteTmFileAlign1(start_ptr, &cur_pos, fname.c_str(), model_name.size); - tm_model.offset_s_mname = WriteTmObject(start_ptr, &cur_pos, &model_name, sizeof(TM_String)); - } - else - tm_model.offset_s_mname = NOT_SET; - - /* Write the subgraphs */ - /* Only 1 subgraph is supported currently */ - size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * 1; - TM_Vector_offsets* v_subgraphs = ( TM_Vector_offsets* )malloc(vector_size); - v_subgraphs->v_num = 1; - v_subgraphs->offsets[0] = SaveTmSubgraph(start_ptr, &cur_pos, graph); - 
- /* Write the vector of subgraphs */ - tm_model.offset_vo_subgraphs = WriteTmObject(start_ptr, &cur_pos, v_subgraphs, vector_size); - - /* Write the model */ - header.offset_root = WriteTmObject(start_ptr, &cur_pos, &tm_model, sizeof(TM_Model)); - tm_model_size = cur_pos; - - /* Write the header */ - cur_pos = 0; - WriteTmObject(start_ptr, &cur_pos, &header, sizeof(TM_Header)); - - free(v_subgraphs); + TmSerializerPtr tm_serializer; + TmSerializerManager::SafeGet("tm_v2", tm_serializer); + + bool ret = tm_serializer->SaveModelIntoMem(start_ptr, graph, &tm_model_size); addr_list.push_back(start_ptr); size_list.push_back(tm_model_size); - return true; -} - -bool TmSerializer::LoadNode(StaticGraph* graph, StaticNode* node, const TM_Node* tm_node, void* mmap_buf) -{ - if(tm_node->offset_vi_input_tensors != NOT_SET) - { - const TM_Vector_indices* v_input_tensors = - GetTmPtr(mmap_buf, tm_node->offset_vi_input_tensors); - - /* Set the input tensors to the node */ - for(unsigned int i = 0; i < v_input_tensors->v_num; i++) - { - StaticTensor* tensor = graph->tensor_list[v_input_tensors->indices[i]].get(); - if(!tensor) - { - LOG_ERROR() << "The input tensor not exist: " << v_input_tensors->indices[i] << "\n"; - return false; - } - AddNodeInputTensor(node, tensor); - } - } - - if(tm_node->offset_vi_output_tensors != NOT_SET) - { - const TM_Vector_indices* v_output_tensors = - GetTmPtr(mmap_buf, tm_node->offset_vi_output_tensors); - - /* Set the output tensors to the node */ - for(unsigned int i = 0; i < v_output_tensors->v_num; i++) - { - StaticTensor* tensor = graph->tensor_list[v_output_tensors->indices[i]].get(); - if(!tensor) - { - LOG_ERROR() << "The output tensor not exist: " << v_output_tensors->indices[i] << "\n"; - return false; - } - AddNodeOutputTensor(node, tensor); - } - } - return true; -} - -bool TmSerializer::LoadTensor(StaticGraph* graph, const TM_Tensor* tm_tensor, const TM_Buffer* tm_buf, void* mmap_buf) -{ - /* Set the tensor name */ - int idx = 
tm_tensor->tensor_id; - std::string tm_tensor_name; - if(tm_tensor->offset_s_tname == NOT_SET) - tm_tensor_name = "tensor_" + std::to_string(idx); - else - { - const TM_String* tm_string = GetTmPtr(mmap_buf, tm_tensor->offset_s_tname); - tm_tensor_name.assign(GetTmPtr(mmap_buf, tm_string->offset_data), tm_string->size); - } - - /* Create the static tensor */ - StaticTensor* tensor; - if(tm_tensor->type == kConstTensor) - tensor = CreateStaticConstTensor(graph, tm_tensor_name); - else - tensor = CreateStaticTensor(graph, tm_tensor_name); - if(!tensor) - { - LOG_ERROR() << "Create static const tensor failed: " << tm_tensor_name << "\n"; - return false; - } - - /* Set the dims */ - if(tm_tensor->offset_vd_dims != NOT_SET) - { - const TM_Vector_dims* v_dims = GetTmPtr(mmap_buf, tm_tensor->offset_vd_dims); - if(!v_dims || !(v_dims->v_num)) - { - LOG_ERROR() << "Get tensor dims failed\n"; - return false; - } - std::vector dims; - for(unsigned int i = 0; i < v_dims->v_num; i++) - dims.push_back(v_dims->dims[i]); - SetTensorDim(tensor, dims); - - /* Set the daya layout */ - if(v_dims->v_num == 4) - SetTensorDataLayout(tensor, "NCHW"); - else if(v_dims->v_num == 2) - SetTensorDataLayout(tensor, "HW"); - else if(v_dims->v_num == 1) - SetTensorDataLayout(tensor, "W"); - } - - /* Set the data type */ - if(tm_tensor->data_type == TM_DT_FLOAT32) - SetTensorDataType(tensor, DataType::GetTypeID("float32")); - else if(tm_tensor->data_type == TM_DT_FLOAT16) - SetTensorDataType(tensor, DataType::GetTypeID("float16")); - else if(tm_tensor->data_type == TM_DT_INT32) - SetTensorDataType(tensor, DataType::GetTypeID("int")); - else if(tm_tensor->data_type == TM_DT_INT8) - SetTensorDataType(tensor, DataType::GetTypeID("int8")); - - /* Set the memory size and pointer */ - if(tm_tensor->type == kConstTensor) - { - SetTensorSize(tensor, tm_buf->size); - void* buf = malloc(tm_buf->size); - if(tm_buf->offset_data != NOT_SET) - { - memcpy(buf, GetTmPtr(mmap_buf, tm_buf->offset_data), 
tm_buf->size); - } - - SetConstTensorBuffer(tensor, buf); - SetConstTensorFileLocation(tensor, -1, 0); - } - - return true; -} - -bool TmSerializer::LoadGraph(StaticGraph* graph, const TM_Model* tm_model, void* mmap_buf) -{ - const TM_Vector_offsets* v_graphs = GetTmPtr(mmap_buf, tm_model->offset_vo_subgraphs); - const TM_Subgraph* tm_graph = GetTmPtr(mmap_buf, v_graphs->offsets[0]); - - const TM_Vector_offsets* v_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vo_seq_nodes); - const TM_Vector_offsets* v_tensors = GetTmPtr(mmap_buf, tm_graph->offset_vo_tensors); - const TM_Vector_offsets* v_buffers = GetTmPtr(mmap_buf, tm_graph->offset_vo_buffers); - - /* Load const tensors */ - for(unsigned int i = 0; i < v_tensors->v_num; i++) - { - const TM_Tensor* tm_tensor = GetTmPtr(mmap_buf, v_tensors->offsets[i]); - const TM_Buffer* tm_buf; - if(tm_tensor->type == kConstTensor) - tm_buf = GetTmPtr(mmap_buf, v_buffers->offsets[tm_tensor->buffer_id]); - else - tm_buf = nullptr; - LoadTensor(graph, tm_tensor, tm_buf, mmap_buf); - } - - /* Create static nodes */ - unsigned int i; - for(i = 0; i < v_nodes->v_num; i++) - { - const TM_Node* tm_node = GetTmPtr(mmap_buf, v_nodes->offsets[i]); - int idx = tm_node->node_id; - std::string tm_node_name; - if(tm_node->offset_s_nname == NOT_SET) - tm_node_name = "node_" + std::to_string(idx); - else - { - const TM_String* tm_string = GetTmPtr(mmap_buf, tm_node->offset_s_nname); - tm_node_name.assign(GetTmPtr(mmap_buf, tm_string->offset_data), tm_string->size); - } - - const TM_Operator* tm_operator = GetTmPtr(mmap_buf, tm_node->offset_t_operator); - const std::string& tm_op_name = GetOpStr(tm_operator->operator_type); - - if(!FindOpLoadMethod(tm_op_name)) - { - LOG_ERROR() << "cannot find load function for operator: " << tm_op_name << "\n"; - break; - } - - StaticNode* node = CreateStaticNode(graph, tm_node_name); - if(!LoadNode(graph, node, tm_node, mmap_buf)) - break; - - op_load_t op_func = any_cast(GetOpLoadMethod(tm_op_name)); - - 
if(!op_func(graph, node, mmap_buf, tm_operator)) - break; - - /* Set the dynamic shape of the operator */ - node->op->dynamic_shape = tm_node->dynamic_shape; - } - - if(i < v_nodes->v_num) - return false; - - const TM_Vector_indices* v_input_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vi_input_indices); - const TM_Vector_indices* v_output_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vi_output_indices); - - /* Set the input nodes */ - for(unsigned int i = 0; i < v_input_nodes->v_num; i++) - { - StaticNode* node = graph->node_list[v_input_nodes->indices[i]].get(); - if(!node) - { - LOG_ERROR() << "Input node #" << v_input_nodes->indices[i] << " not exist\n"; - return false; - } - AddGraphInputNode(graph, node); - } - - /* Set the output nodes */ - for(unsigned int i = 0; i < v_output_nodes->v_num; i++) - { - StaticNode* node = graph->node_list[v_output_nodes->indices[i]].get(); - if(!node) - { - LOG_ERROR() << "Output node #" << v_output_nodes->indices[i] << " not exist\n"; - return false; - } - AddGraphOutputNode(graph, node); - } - - return true; + return ret; } bool TmSerializer::LoadBinaryFile(const char* tm_fname, int& fd, void*& buf, int& size) @@ -611,35 +125,6 @@ bool TmSerializer::LoadBinaryFile(const char* tm_fname, int& fd, void*& buf, int return true; } -bool TmSerializer::LoadModelFromMem(void* mmap_buf, StaticGraph* graph) -{ - const TM_Header* tm_header = reinterpret_cast(mmap_buf); - /* Check the version of tm file format */ - if(tm_header->ver_main != TM_FILE_VER_MAIN || tm_header->ver_sub != TM_FILE_VER_SUB || - tm_header->ver_compile != TM_FILE_VER_COMPILE) - { - printf("Wrong version of tm file\n"); - return false; - } - - const TM_Model* tm_model = GetTmPtr(mmap_buf, tm_header->offset_root); - if(tm_model->offset_s_mname == NOT_SET) - { - SetGraphIdentity(graph, "tengine", "tengine_model", "0"); - } - else - { - std::string tm_model_name; - const TM_String* tm_string = GetTmPtr(mmap_buf, tm_model->offset_s_mname); - 
tm_model_name.assign(GetTmPtr(mmap_buf, tm_string->offset_data), tm_string->size); - SetGraphIdentity(graph, "tengine", tm_model_name, "0"); - } - - if(LoadGraph(graph, tm_model, mmap_buf)) - return true; - else - return false; -} bool TmSerializer::LoadModel(const std::vector& file_list, StaticGraph* graph) { int fd; @@ -656,7 +141,17 @@ bool TmSerializer::LoadModel(const std::vector& file_list, StaticGr SetGraphSourceFormat(graph, "tengine"); SetGraphConstTensorFile(graph, file_list[0]); - bool ret = LoadModelFromMem(mmap_buf, graph); + const uint16_t* ver_main = reinterpret_cast(mmap_buf); + TmSerializerPtr tm_serializer; + if(*ver_main < 2) + { + LOG_WARN() << "The input tengine model file is in old format, please regenerate it by using tengine convert tool.\n"; + TmSerializerManager::SafeGet("tm_v1", tm_serializer); + } + else + TmSerializerManager::SafeGet("tm_v2", tm_serializer); + + bool ret = tm_serializer->LoadModelFromMem(mmap_buf, graph); munmap(const_cast(mmap_buf), mmap_size); close(fd); @@ -674,7 +169,17 @@ bool TmSerializer::LoadModel(const std::vector& addr_list, const st SetGraphSource(graph, "in_mem"); SetGraphSourceFormat(graph, "tengine"); - bool ret = LoadModelFromMem(mmap_buf, graph); + const uint16_t* ver_main = reinterpret_cast(mmap_buf); + TmSerializerPtr tm_serializer; + if(*ver_main < 2) + { + LOG_WARN() << "The input tengine model file is in old format, please regenerate it by using tengine convert tool.\n"; + TmSerializerManager::SafeGet("tm_v1", tm_serializer); + } + else + TmSerializerManager::SafeGet("tm_v2", tm_serializer); + + bool ret = tm_serializer->LoadModelFromMem(mmap_buf, graph); if(ret) graph->mem_src.push_back(mmap_buf); @@ -682,21 +187,19 @@ bool TmSerializer::LoadModel(const std::vector& addr_list, const st return ret; } -bool TmSerializerRegisterOpLoader(void) +bool TmSerializerInit(void) { - SerializerPtr serializer; + auto factory = SerializerFactory::GetFactory(); - if(!SerializerManager::SafeGet("tengine", 
serializer)) - return false; + factory->RegisterInterface("tengine"); + auto tm_serializer = factory->Create("tengine"); - TmSerializer* p_tengine = dynamic_cast(serializer.get()); + SerializerManager::SafeAdd("tengine", SerializerPtr(tm_serializer)); - for(int i = 0; i < TM_OPTYPE_NUM; i++) - { - p_tengine->RegisterOpLoadMethod(GetOpStr(i), op_load_t(LoadTmOpFunc(i))); - } + bool ret1 = register_tm1_serializer(); + bool ret2 = register_tm2_serializer(); - return true; + return (ret1 && ret2); } } // namespace TEngine diff --git a/serializer/tengine/v1/Makefile b/serializer/tengine/v1/Makefile new file mode 100644 index 000000000..c62ec5829 --- /dev/null +++ b/serializer/tengine/v1/Makefile @@ -0,0 +1,7 @@ +obj-y+=init.o +obj-y+=tm1_op_load.o +obj-y+=tm1_op_save.o +obj-y+=tm1_serializer.o + +COMMON_CFLAGS+=-I$(shell pwd)/../../include/tengine/v1 + diff --git a/serializer/tengine/v1/init.cpp b/serializer/tengine/v1/init.cpp new file mode 100644 index 000000000..734be3860 --- /dev/null +++ b/serializer/tengine/v1/init.cpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#include "tm_serializer.hpp" +#include "tm1_serializer.hpp" + +namespace TEngine { +namespace TMSerializer1 { + +extern bool TmSerializerRegisterOpLoader1(); + +} + +using namespace TMSerializer1; + +bool register_tm1_serializer(void) +{ + auto factory = TmSerializerFactory::GetFactory(); + + factory->RegisterInterface("tm_v1"); + auto tm_serializer = factory->Create("tm_v1"); + + TmSerializerManager::SafeAdd("tm_v1", TmSerializerPtr(tm_serializer)); + + return TmSerializerRegisterOpLoader1(); +} + +} // namespace TEngine + diff --git a/serializer/tengine/tm_op_load.cpp b/serializer/tengine/v1/tm1_op_load.cpp similarity index 96% rename from serializer/tengine/tm_op_load.cpp rename to serializer/tengine/v1/tm1_op_load.cpp index 27cc00db4..ffaa76e26 100644 --- a/serializer/tengine/tm_op_load.cpp +++ b/serializer/tengine/v1/tm1_op_load.cpp @@ -21,10 +21,12 @@ * Copyright (c) 2018, Open AI Lab * Author: jingyou@openailab.com */ -#include "tm_op_serializer.hpp" +#include "tm1_op_serializer.hpp" namespace TEngine { +namespace TMSerializer1 { + bool LoadTmAccuracyOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM_Operator* tm_op) { StaticOp* op = CreateStaticOp(graph, OP_STR_ACCURACY); @@ -98,13 +100,15 @@ bool LoadTmConvOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, c param.kernel_w = tm_param->kernel_w; param.stride_h = tm_param->stride_h; param.stride_w = tm_param->stride_w; - param.pad_h = tm_param->pad_h; - param.pad_w = tm_param->pad_w; param.dilation_h = tm_param->dilation_h; param.dilation_w = tm_param->dilation_w; param.output_channel = tm_param->output_channel; param.activation = tm_param->activation; param.group = tm_param->group; + param.pad_h0 = tm_param->pad_h; + param.pad_h1 = tm_param->pad_h; + param.pad_w0 = tm_param->pad_w; + param.pad_w1 = tm_param->pad_w; StaticOp* op = CreateStaticOp(graph, op_str); SetOperatorParam(op, 
param); @@ -119,11 +123,18 @@ bool LoadTmDeconvOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, DeconvParam param = any_cast(OpManager::GetOpDefParam(op_str)); const TM_DeconvParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); - param.kernel_size = tm_param->kernel_size; - param.stride = tm_param->stride; - param.pad = tm_param->pad; + param.kernel_h = tm_param->kernel_size; + param.kernel_w = tm_param->kernel_size; + param.stride_h = tm_param->stride; + param.stride_w = tm_param->stride; + param.pad_w0 = tm_param->pad; + param.pad_w1 = tm_param->pad; + param.pad_h0 = tm_param->pad; + param.pad_h1 = tm_param->pad; param.num_output = tm_param->num_output; - param.dilation = tm_param->dilation; + param.dilation_h = tm_param->dilation; + param.dilation_w = tm_param->dilation; + param.group = 1; StaticOp* op = CreateStaticOp(graph, op_str); SetOperatorParam(op, param); @@ -275,23 +286,14 @@ bool LoadTmPoolingOp(StaticGraph* graph, StaticNode* node, void* const start_ptr param.alg = static_cast(tm_param->alg); param.kernel_h = tm_param->kernel_h; param.kernel_w = tm_param->kernel_w; - param.pad_h = tm_param->pad_h; - param.pad_w = tm_param->pad_w; param.stride_h = tm_param->stride_h; param.stride_w = tm_param->stride_w; param.global = tm_param->global; param.caffe_flavor = tm_param->caffe_flavor; - param.kernel_shape.resize(2); - param.kernel_shape[0] = tm_param->kernel_shape[0]; - param.kernel_shape[1] = tm_param->kernel_shape[1]; - param.strides.resize(2); - param.strides[0] = tm_param->strides[0]; - param.strides[1] = tm_param->strides[1]; - param.pads.resize(4); - param.pads[0] = tm_param->pads[0]; - param.pads[1] = tm_param->pads[1]; - param.pads[2] = tm_param->pads[2]; - param.pads[3] = tm_param->pads[3]; + param.pad_h0 = tm_param->pads[0]; + param.pad_w0 = tm_param->pads[1]; + param.pad_h1 = tm_param->pads[2]; + param.pad_w1 = tm_param->pads[3]; StaticOp* op = CreateStaticOp(graph, op_str); SetOperatorParam(op, param); @@ -490,6 +492,7 @@ 
bool LoadTmSliceOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM_SliceParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); param.axis = tm_param->axis; + param.iscaffe = true; StaticOp* op = CreateStaticOp(graph, op_str); SetOperatorParam(op, param); @@ -659,4 +662,6 @@ std::string GetOpStr(uint32_t op_type) } } +} // namespace TMSerializer1 + } // namespace TEngine diff --git a/serializer/tengine/tm_op_save.cpp b/serializer/tengine/v1/tm1_op_save.cpp similarity index 95% rename from serializer/tengine/tm_op_save.cpp rename to serializer/tengine/v1/tm1_op_save.cpp index ab879a12e..8afa95103 100644 --- a/serializer/tengine/tm_op_save.cpp +++ b/serializer/tengine/v1/tm1_op_save.cpp @@ -21,10 +21,12 @@ * Copyright (c) 2018, Open AI Lab * Author: jingyou@openailab.com */ -#include "tm_op_serializer.hpp" +#include "tm1_op_serializer.hpp" namespace TEngine { +namespace TMSerializer1 { + inline void SetTmOperator(TM_Operator* tm_op, const uint32_t op_type, const tm_uoffset_t offset1, const tm_uoffset_t offset2) { @@ -81,27 +83,17 @@ static tm_uoffset_t SaveTmConvOp(void* const start_ptr, tm_uoffset_t* cur_pos, O tm_param.kernel_w = p->kernel_w; tm_param.stride_h = p->stride_h; tm_param.stride_w = p->stride_w; - tm_param.pad_h = p->pad_h; - tm_param.pad_w = p->pad_w; tm_param.dilation_h = p->dilation_h; tm_param.dilation_w = p->dilation_w; tm_param.output_channel = p->output_channel; tm_param.activation = p->activation; tm_param.group = p->group; - if(p->pads.size() == 4) - { - tm_param.pads[0] = p->pads[0]; - tm_param.pads[1] = p->pads[1]; - tm_param.pads[2] = p->pads[2]; - tm_param.pads[3] = p->pads[3]; - } - else - { - tm_param.pads[0] = 0; - tm_param.pads[1] = 0; - tm_param.pads[2] = 0; - tm_param.pads[3] = 0; - } + tm_param.pad_h = p->pad_h0; + tm_param.pad_w = p->pad_w0; + tm_param.pads[0] = p->pad_h0; + tm_param.pads[1] = p->pad_w0; + tm_param.pads[2] = p->pad_h1; + tm_param.pads[3] = p->pad_w1; TM_Operator tm_op; 
SetTmOperator(&tm_op, TM_OPTYPE_CONVOLUTION, NOT_SET, @@ -113,11 +105,12 @@ static tm_uoffset_t SaveTmDeconvOp(void* const start_ptr, tm_uoffset_t* cur_pos, { DeconvParam* p = (dynamic_cast(op))->GetParam(); TM_DeconvParam tm_param; - tm_param.kernel_size = p->kernel_size; - tm_param.stride = p->stride; - tm_param.pad = p->pad; + + tm_param.kernel_size = p->kernel_h; + tm_param.stride = p->stride_h; + tm_param.pad = p->pad_w0; tm_param.num_output = p->num_output; - tm_param.dilation = p->dilation; + tm_param.dilation = p->dilation_h; TM_Operator tm_op; SetTmOperator(&tm_op, TM_OPTYPE_DECONVOLUTION, NOT_SET, @@ -244,20 +237,20 @@ static tm_uoffset_t SaveTmPoolOp(void* const start_ptr, tm_uoffset_t* cur_pos, O tm_param.alg = p->alg; tm_param.kernel_h = p->kernel_h; tm_param.kernel_w = p->kernel_w; - tm_param.pad_h = p->pad_h; - tm_param.pad_w = p->pad_w; + tm_param.pad_h = p->pad_h0; + tm_param.pad_w = p->pad_w0; tm_param.stride_h = p->stride_h; tm_param.stride_w = p->stride_w; tm_param.global = p->global; tm_param.caffe_flavor = p->caffe_flavor; - tm_param.kernel_shape[0] = p->kernel_shape[0]; - tm_param.kernel_shape[1] = p->kernel_shape[1]; - tm_param.strides[0] = p->strides[0]; - tm_param.strides[1] = p->strides[1]; - tm_param.pads[0] = p->pads[0]; - tm_param.pads[1] = p->pads[1]; - tm_param.pads[2] = p->pads[2]; - tm_param.pads[3] = p->pads[3]; + tm_param.kernel_shape[0] = p->kernel_h; + tm_param.kernel_shape[1] = p->kernel_w; + tm_param.strides[0] = p->stride_h; + tm_param.strides[1] = p->stride_w; + tm_param.pads[0] = p->pad_h0; + tm_param.pads[1] = p->pad_w0; + tm_param.pads[2] = p->pad_h1; + tm_param.pads[3] = p->pad_w1; TM_Operator tm_op; SetTmOperator(&tm_op, TM_OPTYPE_POOLING, NOT_SET, @@ -600,4 +593,6 @@ tm_uoffset_t SaveTmOperator(void* const start_ptr, tm_uoffset_t* cur_pos, Operat return 0; } +} // namespace TMSerializer1 + } // namespace TEngine diff --git a/serializer/tengine/v1/tm1_serializer.cpp b/serializer/tengine/v1/tm1_serializer.cpp new file 
mode 100644 index 000000000..0938fd1b3 --- /dev/null +++ b/serializer/tengine/v1/tm1_serializer.cpp @@ -0,0 +1,595 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: jingyou@openailab.com + */ +#include +#include +#include +#include +#include + +#include "tengine_c_api.h" +#include "exec_attr.hpp" +#include "data_type.hpp" +#include "operator_manager.hpp" +#include "static_graph.hpp" +#include "graph.hpp" +#include "node.hpp" +#include "tensor.hpp" +#include "compiler.hpp" + +#include "tm1_format.h" +#include "tm1_serializer.hpp" +#include "tm1_op_serializer.hpp" + +namespace TEngine { + +namespace TMSerializer1 { + +bool TmSerializer1::IsSaveString(void) +{ + const char* env = std::getenv("TM_WITH_STRING"); + + if(env) + return true; + else + return false; +} + +bool TmSerializer1::IsSaveData(void) +{ + const char* env = std::getenv("TM_FOR_BENCHMARK"); + + if(env) + return false; + else + return true; +} + +tm_uoffset_t TmSerializer1::SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, Tensor* tensor, + unsigned int tensor_id, unsigned int buffer_id) +{ + TM_Tensor tm_tensor; + tm_tensor.tensor_id = tensor_id; + tm_tensor.buffer_id = buffer_id; + 
tm_tensor.type = tensor->GetType(); + + bool tm_with_string = IsSaveString(); + + if(tm_with_string) + { + std::string name = tensor->GetName(); + TM_String tensor_name; + tensor_name.size = name.size(); + tensor_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), tensor_name.size); + tm_tensor.offset_s_tname = WriteTmObject(start_ptr, cur_pos, &tensor_name, sizeof(TM_String)); + } + else + tm_tensor.offset_s_tname = NOT_SET; + + const std::string& data_type = DataType::GetTypeName(tensor->GetDataType()); + if(data_type == "float32") + tm_tensor.data_type = TM_DT_FLOAT32; + else if(data_type == "float16") + tm_tensor.data_type = TM_DT_FLOAT16; + else if(data_type == "int") + tm_tensor.data_type = TM_DT_INT32; + else if(data_type == "int8") + tm_tensor.data_type = TM_DT_INT8; + + /* Get the dims of the tensor */ + TShape& shape = tensor->GetShape(); + std::vector& dim = shape.GetDim(); + if(dim.size()) + { + /* Write the vector of dims */ + size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * dim.size(); + TM_Vector_dims* v_dims = ( TM_Vector_dims* )malloc(vector_size); + v_dims->v_num = dim.size(); + for(unsigned int i = 0; i < dim.size(); i++) + { + v_dims->dims[i] = dim[i]; + } + tm_tensor.offset_vd_dims = WriteTmObject(start_ptr, cur_pos, v_dims, vector_size); + free(v_dims); + } + else + tm_tensor.offset_vd_dims = NOT_SET; + + /* Write the tensor */ + return WriteTmObject(start_ptr, cur_pos, &tm_tensor, sizeof(TM_Tensor)); +} + +tm_uoffset_t TmSerializer1::SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, Node* node, + name_map_t& tensor_name_map) +{ + TM_Node tm_node; + tm_node.node_id = node->GetNodeIndex(); + tm_node.dynamic_shape = node->IsDynamicShape(); + + bool tm_with_string = IsSaveString(); + + if(tm_with_string) + { + std::string name = node->GetName(); + TM_String node_name; + node_name.size = name.size(); + node_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), node_name.size); + 
tm_node.offset_s_nname = WriteTmObject(start_ptr, cur_pos, &node_name, sizeof(TM_String)); + } + else + tm_node.offset_s_nname = NOT_SET; + + unsigned int input_num = node->GetInputNum(); + unsigned int output_num = node->GetOutputNum(); + + if(input_num) + { + /* Write the vector of input indices */ + size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * input_num; + TM_Vector_indices* v_input_indices = ( TM_Vector_indices* )malloc(vector_size); + v_input_indices->v_num = input_num; + for(unsigned int i = 0; i < input_num; i++) + { + Tensor* p_tensor = node->GetInputTensor(i); + v_input_indices->indices[i] = tensor_name_map[p_tensor->GetName()]; + } + tm_node.offset_vi_input_tensors = WriteTmObject(start_ptr, cur_pos, v_input_indices, vector_size); + free(v_input_indices); + } + else + tm_node.offset_vi_input_tensors = NOT_SET; + + if(output_num) + { + /* Write the vector of output indices */ + size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * output_num; + TM_Vector_indices* v_output_indices = ( TM_Vector_indices* )malloc(vector_size); + v_output_indices->v_num = output_num; + for(unsigned int i = 0; i < output_num; i++) + { + Tensor* p_tensor = node->GetOutputTensor(i); + v_output_indices->indices[i] = tensor_name_map[p_tensor->GetName()]; + } + tm_node.offset_vi_output_tensors = WriteTmObject(start_ptr, cur_pos, v_output_indices, vector_size); + free(v_output_indices); + } + else + tm_node.offset_vi_output_tensors = NOT_SET; + + tm_node.offset_t_operator = SaveTmOperator(start_ptr, cur_pos, node->GetOp()); + + /* Write the node */ + return WriteTmObject(start_ptr, cur_pos, &tm_node, sizeof(TM_Node)); +} + +tm_uoffset_t TmSerializer1::SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, Graph* graph) +{ + TM_Subgraph tm_subgraph; + tm_subgraph.subgraph_id = 0; /* subgraph_id starts from 0 */ + tm_subgraph.offset_s_sname = NOT_SET; + + unsigned int tensor_num = 0; + unsigned int buffer_num = 0; + std::vector tensor_ptrs; + std::vector 
buf_ptrs; + std::vector buf_sizes; + name_map_t tensor_name_map; /* map of tensor name and tensor index */ + bool tm_no_data = !IsSaveData(); + + /* Write the nodes */ + size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * graph->seq_nodes.size(); + TM_Vector_offsets* v_nodes = ( TM_Vector_offsets* )malloc(vector_size); + v_nodes->v_num = graph->seq_nodes.size(); + for(unsigned int i = 0; i < graph->seq_nodes.size(); i++) + { + Node* p_node = graph->seq_nodes[i]; + for(unsigned int k = 0; k < p_node->GetOutputNum(); k++) + { + Tensor* p_tensor = p_node->GetOutputTensor(k); + tensor_ptrs.push_back(p_tensor); + tensor_name_map[p_tensor->GetName()] = tensor_num; + tensor_num++; + } + v_nodes->offsets[i] = SaveTmNode(start_ptr, cur_pos, p_node, tensor_name_map); + } + /* Write the vector of nodes */ + tm_subgraph.offset_vo_seq_nodes = WriteTmObject(start_ptr, cur_pos, v_nodes, vector_size); + + /* Write the tensors */ + vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor_num; + TM_Vector_offsets* v_tensors = ( TM_Vector_offsets* )malloc(vector_size); + v_tensors->v_num = tensor_num; + for(unsigned int i = 0; i < tensor_num; i++) + { + Tensor* p_tensor = tensor_ptrs[i]; + if(p_tensor->GetType() == kConstTensor) + { + buf_ptrs.push_back(p_tensor->GetMemAddr()); + buf_sizes.push_back(p_tensor->GetTotalSize()); + buffer_num++; + } + + v_tensors->offsets[i] = SaveTmTensor(start_ptr, cur_pos, p_tensor, i, buffer_num - 1); + } + /* Write the vector of tensors */ + tm_subgraph.offset_vo_tensors = WriteTmObject(start_ptr, cur_pos, v_tensors, vector_size); + + /* Write the buffers */ + vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * buffer_num; + TM_Vector_offsets* v_buffers = ( TM_Vector_offsets* )malloc(vector_size); + v_buffers->v_num = buffer_num; + for(unsigned int i = 0; i < buffer_num; i++) + { + TM_Buffer tm_buf; + tm_buf.size = buf_sizes[i]; + + if(tm_no_data) + { + /* TM_FOR_BENCHMARK environment variable exists. 
Not write buf data into the tm file */ + tm_buf.offset_data = NOT_SET; + } + else + { + /* TM_FOR_BENCHMARK environment variable does not exist */ + tm_buf.offset_data = + WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast(buf_ptrs[i]), tm_buf.size); + } + v_buffers->offsets[i] = WriteTmObject(start_ptr, cur_pos, &tm_buf, sizeof(TM_Buffer)); + } + /* Write the vector of buffers */ + tm_subgraph.offset_vo_buffers = WriteTmObject(start_ptr, cur_pos, v_buffers, vector_size); + + /* Write the vector of input indices */ + vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->input_nodes.size(); + TM_Vector_indices* v_input_indices = ( TM_Vector_indices* )malloc(vector_size); + v_input_indices->v_num = graph->input_nodes.size(); + for(unsigned int i = 0; i < graph->input_nodes.size(); i++) + { + v_input_indices->indices[i] = graph->input_nodes[i]->GetNodeIndex(); + } + tm_subgraph.offset_vi_input_indices = WriteTmObject(start_ptr, cur_pos, v_input_indices, vector_size); + + /* Write the vector of output indices */ + vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->output_nodes.size(); + TM_Vector_indices* v_output_indices = ( TM_Vector_indices* )malloc(vector_size); + v_output_indices->v_num = graph->output_nodes.size(); + for(unsigned int i = 0; i < graph->output_nodes.size(); i++) + { + v_output_indices->indices[i] = graph->output_nodes[i]->GetNodeIndex(); + } + tm_subgraph.offset_vi_output_indices = WriteTmObject(start_ptr, cur_pos, v_output_indices, vector_size); + + /* Write the subgraph */ + tm_uoffset_t ret = WriteTmObject(start_ptr, cur_pos, &tm_subgraph, sizeof(TM_Subgraph)); + + /* Free the memory of vectors */ + free(v_tensors); + free(v_buffers); + free(v_nodes); + free(v_input_indices); + free(v_output_indices); + + return ret; +} + +bool TmSerializer1::SaveModelIntoMem(void* start_ptr, Graph* graph, uint32_t* tm_model_size) +{ + bool tm_with_string = IsSaveString(); + + tm_uoffset_t cur_pos = sizeof(TM_Header); + + /* Define the 
TM_Header object */ + TM_Header header; + header.ver_main = TM_FILE_VER_MAIN; + header.ver_sub = TM_FILE_VER_SUB; + header.ver_compile = TM_FILE_VER_COMPILE; + + /* Define the TM_Model object */ + TM_Model tm_model; + if(tm_with_string) + { + const std::string& fname = graph->GetName(); + TM_String model_name; + model_name.size = fname.size(); + model_name.offset_data = WriteTmFileAlign1(start_ptr, &cur_pos, fname.c_str(), model_name.size); + tm_model.offset_s_mname = WriteTmObject(start_ptr, &cur_pos, &model_name, sizeof(TM_String)); + } + else + tm_model.offset_s_mname = NOT_SET; + + /* Write the subgraphs */ + /* Only 1 subgraph is supported currently */ + size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * 1; + TM_Vector_offsets* v_subgraphs = ( TM_Vector_offsets* )malloc(vector_size); + v_subgraphs->v_num = 1; + v_subgraphs->offsets[0] = SaveTmSubgraph(start_ptr, &cur_pos, graph); + + /* Write the vector of subgraphs */ + tm_model.offset_vo_subgraphs = WriteTmObject(start_ptr, &cur_pos, v_subgraphs, vector_size); + + /* Write the model */ + header.offset_root = WriteTmObject(start_ptr, &cur_pos, &tm_model, sizeof(TM_Model)); + *tm_model_size = cur_pos; + + /* Write the header */ + cur_pos = 0; + WriteTmObject(start_ptr, &cur_pos, &header, sizeof(TM_Header)); + + free(v_subgraphs); + + return true; +} + +bool TmSerializer1::LoadNode(StaticGraph* graph, StaticNode* node, const TM_Node* tm_node, void* mmap_buf) +{ + if(tm_node->offset_vi_input_tensors != NOT_SET) + { + const TM_Vector_indices* v_input_tensors = + GetTmPtr(mmap_buf, tm_node->offset_vi_input_tensors); + + /* Set the input tensors to the node */ + for(unsigned int i = 0; i < v_input_tensors->v_num; i++) + { + StaticTensor* tensor = graph->tensor_list[v_input_tensors->indices[i]].get(); + if(!tensor) + { + LOG_ERROR() << "The input tensor not exist: " << v_input_tensors->indices[i] << "\n"; + return false; + } + AddNodeInputTensor(node, tensor); + } + } + + 
if(tm_node->offset_vi_output_tensors != NOT_SET) + { + const TM_Vector_indices* v_output_tensors = + GetTmPtr(mmap_buf, tm_node->offset_vi_output_tensors); + + /* Set the output tensors to the node */ + for(unsigned int i = 0; i < v_output_tensors->v_num; i++) + { + StaticTensor* tensor = graph->tensor_list[v_output_tensors->indices[i]].get(); + if(!tensor) + { + LOG_ERROR() << "The output tensor not exist: " << v_output_tensors->indices[i] << "\n"; + return false; + } + AddNodeOutputTensor(node, tensor); + } + } + return true; +} + +bool TmSerializer1::LoadTensor(StaticGraph* graph, const TM_Tensor* tm_tensor, const TM_Buffer* tm_buf, void* mmap_buf) +{ + /* Set the tensor name */ + int idx = tm_tensor->tensor_id; + std::string tm_tensor_name; + if(tm_tensor->offset_s_tname == NOT_SET) + tm_tensor_name = "tensor_" + std::to_string(idx); + else + { + const TM_String* tm_string = GetTmPtr(mmap_buf, tm_tensor->offset_s_tname); + tm_tensor_name.assign(GetTmPtr(mmap_buf, tm_string->offset_data), tm_string->size); + } + + /* Create the static tensor */ + StaticTensor* tensor; + if(tm_tensor->type == kConstTensor) + tensor = CreateStaticConstTensor(graph, tm_tensor_name); + else + tensor = CreateStaticTensor(graph, tm_tensor_name); + if(!tensor) + { + LOG_ERROR() << "Create static const tensor failed: " << tm_tensor_name << "\n"; + return false; + } + + /* Set the dims */ + if(tm_tensor->offset_vd_dims != NOT_SET) + { + const TM_Vector_dims* v_dims = GetTmPtr(mmap_buf, tm_tensor->offset_vd_dims); + if(!v_dims || !(v_dims->v_num)) + { + LOG_ERROR() << "Get tensor dims failed\n"; + return false; + } + std::vector dims; + for(unsigned int i = 0; i < v_dims->v_num; i++) + dims.push_back(v_dims->dims[i]); + SetTensorDim(tensor, dims); + + } + + /* Set the data type */ + if(tm_tensor->data_type == TM_DT_FLOAT32) + SetTensorDataType(tensor, DataType::GetTypeID("float32")); + else if(tm_tensor->data_type == TM_DT_FLOAT16) + SetTensorDataType(tensor, 
DataType::GetTypeID("float16")); + else if(tm_tensor->data_type == TM_DT_INT32) + SetTensorDataType(tensor, DataType::GetTypeID("int")); + else if(tm_tensor->data_type == TM_DT_INT8) + SetTensorDataType(tensor, DataType::GetTypeID("int8")); + + /* Set the memory size and pointer */ + if(tm_tensor->type == kConstTensor) + { + SetTensorSize(tensor, tm_buf->size); + void* buf = malloc(tm_buf->size); + if(tm_buf->offset_data != NOT_SET) + { + memcpy(buf, GetTmPtr(mmap_buf, tm_buf->offset_data), tm_buf->size); + } + + SetConstTensorBuffer(tensor, buf); + SetConstTensorFileLocation(tensor, -1, 0); + } + + return true; +} + +bool TmSerializer1::LoadGraph(StaticGraph* graph, const TM_Model* tm_model, void* mmap_buf) +{ + const TM_Vector_offsets* v_graphs = GetTmPtr(mmap_buf, tm_model->offset_vo_subgraphs); + const TM_Subgraph* tm_graph = GetTmPtr(mmap_buf, v_graphs->offsets[0]); + + const TM_Vector_offsets* v_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vo_seq_nodes); + const TM_Vector_offsets* v_tensors = GetTmPtr(mmap_buf, tm_graph->offset_vo_tensors); + const TM_Vector_offsets* v_buffers = GetTmPtr(mmap_buf, tm_graph->offset_vo_buffers); + + /* Load const tensors */ + for(unsigned int i = 0; i < v_tensors->v_num; i++) + { + const TM_Tensor* tm_tensor = GetTmPtr(mmap_buf, v_tensors->offsets[i]); + const TM_Buffer* tm_buf; + if(tm_tensor->type == kConstTensor) + tm_buf = GetTmPtr(mmap_buf, v_buffers->offsets[tm_tensor->buffer_id]); + else + tm_buf = nullptr; + LoadTensor(graph, tm_tensor, tm_buf, mmap_buf); + } + + /* Create static nodes */ + unsigned int i; + for(i = 0; i < v_nodes->v_num; i++) + { + const TM_Node* tm_node = GetTmPtr(mmap_buf, v_nodes->offsets[i]); + int idx = tm_node->node_id; + std::string tm_node_name; + if(tm_node->offset_s_nname == NOT_SET) + tm_node_name = "node_" + std::to_string(idx); + else + { + const TM_String* tm_string = GetTmPtr(mmap_buf, tm_node->offset_s_nname); + tm_node_name.assign(GetTmPtr(mmap_buf, tm_string->offset_data), 
tm_string->size); + } + + const TM_Operator* tm_operator = GetTmPtr(mmap_buf, tm_node->offset_t_operator); + const std::string& tm_op_name = GetOpStr(tm_operator->operator_type); + + if(!FindOpLoadMethod(tm_op_name)) + { + LOG_ERROR() << "cannot find load function for operator: " << tm_op_name << "\n"; + break; + } + + StaticNode* node = CreateStaticNode(graph, tm_node_name); + if(!LoadNode(graph, node, tm_node, mmap_buf)) + break; + + op_load_t op_func = any_cast(GetOpLoadMethod(tm_op_name)); + + if(!op_func(graph, node, mmap_buf, tm_operator)) + break; + + /* Set the dynamic shape of the operator */ + node->op->dynamic_shape = tm_node->dynamic_shape; + } + + if(i < v_nodes->v_num) + return false; + + const TM_Vector_indices* v_input_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vi_input_indices); + const TM_Vector_indices* v_output_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vi_output_indices); + + /* Set the input nodes */ + for(unsigned int i = 0; i < v_input_nodes->v_num; i++) + { + StaticNode* node = graph->node_list[v_input_nodes->indices[i]].get(); + if(!node) + { + LOG_ERROR() << "Input node #" << v_input_nodes->indices[i] << " not exist\n"; + return false; + } + AddGraphInputNode(graph, node); + } + + /* Set the output nodes */ + for(unsigned int i = 0; i < v_output_nodes->v_num; i++) + { + StaticNode* node = graph->node_list[v_output_nodes->indices[i]].get(); + if(!node) + { + LOG_ERROR() << "Output node #" << v_output_nodes->indices[i] << " not exist\n"; + return false; + } + AddGraphOutputNode(graph, node); + } + + return true; +} + +bool TmSerializer1::LoadModelFromMem(void* mmap_buf, StaticGraph* graph) +{ + const TM_Header* tm_header = reinterpret_cast(mmap_buf); + /* Check the version of tm file format */ + if(tm_header->ver_main != TM_FILE_VER_MAIN || tm_header->ver_sub != TM_FILE_VER_SUB || + tm_header->ver_compile != TM_FILE_VER_COMPILE) + { + printf("Wrong version of tm file\n"); + return false; + } + + const TM_Model* tm_model = 
GetTmPtr(mmap_buf, tm_header->offset_root); + if(tm_model->offset_s_mname == NOT_SET) + { + SetGraphIdentity(graph, "tengine", "tengine_model", "0"); + } + else + { + std::string tm_model_name; + const TM_String* tm_string = GetTmPtr(mmap_buf, tm_model->offset_s_mname); + tm_model_name.assign(GetTmPtr(mmap_buf, tm_string->offset_data), tm_string->size); + SetGraphIdentity(graph, "tengine", tm_model_name, "0"); + } + + SetModelFormat(graph,MODEL_FORMAT_TENGINE); + SetGraphLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelLayout(graph,TENGINE_LAYOUT_NCHW); + + if(LoadGraph(graph, tm_model, mmap_buf)) + return true; + else + return false; +} + +bool TmSerializerRegisterOpLoader1(void) +{ + TmSerializerPtr serializer; + + if(!TmSerializerManager::SafeGet("tm_v1", serializer)) + return false; + + TmSerializer1* p_tengine = dynamic_cast(serializer.get()); + + for(int i = 0; i < TM_OPTYPE_NUM; i++) + { + p_tengine->RegisterOpLoadMethod(GetOpStr(i), op_load_t(LoadTmOpFunc(i))); + } + + return true; +} + +} // namespace TMSerializer1 + +} // namespace TEngine diff --git a/serializer/tengine/v2/Makefile b/serializer/tengine/v2/Makefile new file mode 100644 index 000000000..dc9eea616 --- /dev/null +++ b/serializer/tengine/v2/Makefile @@ -0,0 +1,7 @@ +obj-y+=init.o +obj-y+=tm2_op_load.o +obj-y+=tm2_op_save.o +obj-y+=tm2_serializer.o + +COMMON_CFLAGS+=-I$(shell pwd)/../../include/tengine/v2 + diff --git a/serializer/tengine/v2/init.cpp b/serializer/tengine/v2/init.cpp new file mode 100644 index 000000000..3c9825059 --- /dev/null +++ b/serializer/tengine/v2/init.cpp @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#include "tm_serializer.hpp" +#include "tm2_serializer.hpp" + +namespace TEngine { +namespace TMSerializer2 { + +extern bool TmSerializerRegisterOpLoader2(); + +} + +using namespace TMSerializer2; + +bool register_tm2_serializer(void) +{ + auto factory = TmSerializerFactory::GetFactory(); + + factory->RegisterInterface("tm_v2"); + auto tm_serializer = factory->Create("tm_v2"); + + TmSerializerManager::SafeAdd("tm_v2", TmSerializerPtr(tm_serializer)); + + return TmSerializerRegisterOpLoader2(); +} + +} // namespace TEngine + diff --git a/serializer/tengine/v2/tm2_op_load.cpp b/serializer/tengine/v2/tm2_op_load.cpp new file mode 100644 index 000000000..ad119d509 --- /dev/null +++ b/serializer/tengine/v2/tm2_op_load.cpp @@ -0,0 +1,895 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#include + +#include "tm2_format.h" +#include "tm2_op_serializer.hpp" + +namespace TEngine { + +namespace TMSerializer2 { + +bool LoadTmAccuracyOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_ACCURACY); + SetNodeOp(node, op); + return true; +} + +bool LoadTmBatchNormOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_BATCHNORMALIZATION; + + BatchNormParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_BatchNormParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.rescale_factor = tm_param->rescale_factor; + param.eps = tm_param->eps; + param.caffe_flavor = tm_param->caffe_flavor; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmResizeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_BILINEARRESIZE; + + ResizeParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ResizeParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.scale_w = tm_param->scale_x; + param.scale_h = tm_param->scale_y; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmConcatOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_CONCAT; + + ConcatParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ConcatParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.axis = tm_param->axis; + + StaticOp* op = 
CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmConstOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_CONST); + SetNodeOp(node, op); + return true; +} + +bool LoadTmConvOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_CONVOLUTION; + + ConvParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ConvParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.kernel_h = tm_param->kernel_h; + param.kernel_w = tm_param->kernel_w; + param.stride_h = tm_param->stride_h; + param.stride_w = tm_param->stride_w; + param.dilation_h = tm_param->dilation_h; + param.dilation_w = tm_param->dilation_w; + param.input_channel = tm_param->input_channel; + param.output_channel = tm_param->output_channel; + param.group = tm_param->group; + param.activation = tm_param->activation; + param.pad_h0 = tm_param->pad_h0; + param.pad_h1 = tm_param->pad_h1; + param.pad_w0 = tm_param->pad_w0; + param.pad_w1 = tm_param->pad_w1; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmDeconvOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_DECONVOLUTION; + + DeconvParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_DeconvParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.kernel_h = tm_param->kernel_h; + param.kernel_w = tm_param->kernel_w; + param.stride_h = tm_param->stride_h; + param.stride_w = tm_param->stride_w; + param.pad_w0 = tm_param->pad_w0; + param.pad_w1 = tm_param->pad_w1; + param.pad_h0 = tm_param->pad_h0; + param.pad_h1 = tm_param->pad_h1; + param.num_output = tm_param->num_output; + param.dilation_h = 
tm_param->dilation_h; + param.dilation_w = tm_param->dilation_w; + param.group = tm_param->group; + param.activation = tm_param->activation; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmDetectionOutputOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_DETECTIONOUTPUT; + + DetectionOutputParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_DetectionOutputParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.num_classes = tm_param->num_classes; + param.keep_top_k = tm_param->keep_top_k; + param.nms_top_k = tm_param->nms_top_k; + param.confidence_threshold = tm_param->confidence_threshold; + param.nms_threshold = tm_param->nms_threshold; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmDropoutOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_DROPOUT); + SetNodeOp(node, op); + return true; +} + +bool LoadTmEltwiseOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_ELTWISE; + + EltwiseParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_EltwiseParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.type = static_cast(tm_param->type); + param.caffe_flavor = tm_param->caffe_flavor; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmFlattenOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_FLATTEN; + + FlattenParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_FlattenParam* tm_param = 
GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.axis = tm_param->axis; + param.end_axis = tm_param->end_axis; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmFCOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_FULLYCONNECTED; + + FCParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_FCParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.num_output = tm_param->num_output; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmInputOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_INPUTOP); + SetNodeOp(node, op); + return true; +} + +bool LoadTmLRNOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_LRN; + + LRNParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_LRNParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.local_size = tm_param->local_size; + param.alpha = tm_param->alpha; + param.beta = tm_param->beta; + param.norm_region = tm_param->norm_region; + param.k = tm_param->k; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmNormalizeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_NORMALIZE; + + NormalizeParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_NormalizeParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.across_spatial = tm_param->across_spatial; + param.channel_shared = tm_param->channel_shared; + + StaticOp* op = CreateStaticOp(graph, op_str); 
+ SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmPermuteOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_PERMUTE; + + PermuteParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_PermuteParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.flag = tm_param->flag; + param.order0 = tm_param->order0; + param.order1 = tm_param->order1; + param.order2 = tm_param->order2; + param.order3 = tm_param->order3; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmPoolingOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_POOLING; + + PoolParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_PoolParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.alg = static_cast(tm_param->alg); + param.kernel_h = tm_param->kernel_h; + param.kernel_w = tm_param->kernel_w; + param.stride_h = tm_param->stride_h; + param.stride_w = tm_param->stride_w; + param.global = tm_param->global; + param.caffe_flavor = tm_param->caffe_flavor; + param.pad_h0 = tm_param->pad_h0; + param.pad_w0 = tm_param->pad_w0; + param.pad_h1 = tm_param->pad_h1; + param.pad_w1 = tm_param->pad_w1; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmPreluOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_PRELU); + SetNodeOp(node, op); + return true; +} + +bool LoadTmPriorBoxOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_PRIORBOX; + + PriorBoxParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const 
TM2_PriorBoxParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + const TM2_Vector_floats* v_minsizes = GetTmPtr(start_ptr, tm_param->offset_vf_min_size); + const TM2_Vector_floats* v_maxsizes = GetTmPtr(start_ptr, tm_param->offset_vf_max_size); + const TM2_Vector_floats* v_variances = GetTmPtr(start_ptr, tm_param->offset_vf_variance); + const TM2_Vector_floats* v_ratios = GetTmPtr(start_ptr, tm_param->offset_vf_aspect_ratio); + + for(unsigned int i = 0; i < v_minsizes->v_num; i++) + param.min_size.push_back(v_minsizes->data[i]); + for(unsigned int i = 0; i < v_maxsizes->v_num; i++) + param.max_size.push_back(v_maxsizes->data[i]); + for(unsigned int i = 0; i < v_variances->v_num; i++) + param.variance.push_back(v_variances->data[i]); + for(unsigned int i = 0; i < v_ratios->v_num; i++) + param.aspect_ratio.push_back(v_ratios->data[i]); + param.flip = tm_param->flip; + param.clip = tm_param->clip; + param.img_size = tm_param->img_size; + param.img_h = tm_param->img_h; + param.img_w = tm_param->img_w; + param.step_w = tm_param->step_w; + param.step_h = tm_param->step_h; + param.offset = tm_param->offset; + param.num_priors_ = tm_param->num_priors; + param.out_dim_ = tm_param->out_dim; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmRegionOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_REGION; + + RegionParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_RegionParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + const TM2_Vector_floats* v_biases = GetTmPtr(start_ptr, tm_param->offset_vf_biases); + + for(unsigned int i = 0; i < v_biases->v_num; i++) + param.biases.push_back(v_biases->data[i]); + param.num_classes = tm_param->num_classes; + param.side = tm_param->side; + param.num_box = tm_param->num_box; + param.coords = tm_param->coords; + 
param.confidence_threshold = tm_param->confidence_threshold; + param.nms_threshold = tm_param->nms_threshold; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmReLuOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_RELU; + + ReLuParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ReLuParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.negative_slope = tm_param->negative_slope; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmRelu6Op(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_RELU6); + SetNodeOp(node, op); + return true; +} + +bool LoadTmReorgOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_REORG; + + ReorgParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ReorgParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.stride = tm_param->stride; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmReshapeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_RESHAPE; + + ReshapeParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ReshapeParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.dim_0 = tm_param->dim_0; + param.dim_1 = tm_param->dim_1; + param.dim_2 = tm_param->dim_2; + param.dim_3 = tm_param->dim_3; + param.dim_size = tm_param->dim_size; + param.axis = tm_param->axis; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + 
SetNodeOp(node, op); + return true; +} + +bool LoadTmROIPoolingOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_ROIPOOLING; + + ROIPoolingParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ROIPoolingParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.pooled_h = tm_param->pooled_h; + param.pooled_w = tm_param->pooled_w; + param.spatial_scale = tm_param->spatial_scale; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmRPNOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_RPN; + + RPNParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_RPNParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + const TM2_Vector_floats* v_ratios = GetTmPtr(start_ptr, tm_param->offset_vf_ratios); + const TM2_Vector_floats* v_scales = GetTmPtr(start_ptr, tm_param->offset_vf_anchor_scales); + + for(unsigned int i = 0; i < v_ratios->v_num; i++) + param.ratios.push_back(v_ratios->data[i]); + for(unsigned int i = 0; i < v_scales->v_num; i++) + param.anchor_scales.push_back(v_scales->data[i]); + param.feat_stride = tm_param->feat_stride; + param.basesize = tm_param->basesize; + param.min_size = tm_param->min_size; + param.per_nms_topn = tm_param->per_nms_topn; + param.post_nms_topn = tm_param->post_nms_topn; + param.nms_thresh = tm_param->nms_thresh; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmScaleOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_SCALE; + + ScaleParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_ScaleParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + 
param.axis = tm_param->axis; + param.num_axes = tm_param->num_axes; + param.bias_term = tm_param->bias_term; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmSliceOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_SLICE; + + SliceParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_SliceParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + if(tm_param->offset_vi_slice_points != TM2_NOT_SET) + { + const TM2_Vector_dims* v_slice_points = GetTmPtr(start_ptr, tm_param->offset_vi_slice_points); + for(unsigned int i = 0; i < v_slice_points->v_num; i++) + param.slice_point_.push_back(v_slice_points->dims[i]); + } + if(tm_param->offset_vi_begins != TM2_NOT_SET) + { + const TM2_Vector_dims* v_begins = GetTmPtr(start_ptr, tm_param->offset_vi_begins); + for(unsigned int i = 0; i < v_begins->v_num; i++) + param.begin_.push_back(v_begins->dims[i]); + } + if(tm_param->offset_vi_sizes != TM2_NOT_SET) + { + const TM2_Vector_dims* v_sizes = GetTmPtr(start_ptr, tm_param->offset_vi_sizes); + for(unsigned int i = 0; i < v_sizes->v_num; i++) + param.size_.push_back(v_sizes->dims[i]); + } + + param.axis = tm_param->axis; + param.iscaffe = tm_param->iscaffe; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmSoftmaxOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_SOFTMAX; + + SoftmaxParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_SoftmaxParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.axis = tm_param->axis; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmSplitOp(StaticGraph* graph, StaticNode* 
node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_SPLIT); + SetNodeOp(node, op); + return true; +} + +bool LoadTmDetectionPostProcessOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_DETECTIONPOSTPROCESS; + + DetectionPostProcessParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_DetectionPostProcessParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.max_detections = tm_param->max_detections; + param.max_classes_per_detection = tm_param->max_classes_per_detection; + param.nms_score_threshold = tm_param->nms_score_threshold; + param.nms_iou_threshold = tm_param->nms_iou_threshold; + param.num_classes = tm_param->num_classes; + + const TM2_Vector_floats* v_scales = GetTmPtr(start_ptr, tm_param->offset_vf_scales); + + for(unsigned int i = 0; i < v_scales->v_num; i++) + param.scales.push_back(v_scales->data[i]); + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmGemmOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_GEMM; + + GemmParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_GemmParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.alpha = tm_param->alpha; + param.beta = tm_param->beta; + param.transA = tm_param->transA; + param.transB = tm_param->transB; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmGenericOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_GENERIC; + + GenericParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_GenericParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + 
param.max_input_num = tm_param->max_input_num; + param.max_output_num = tm_param->max_output_num; + + const TM2_String* tm_string = GetTmPtr(start_ptr, tm_param->offset_s_opname); + char *op_name = (char *)malloc(tm_string->size); + memcpy(op_name, GetTmPtr(start_ptr, tm_string->offset_data), tm_string->size); + param.op_name = op_name; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + + return true; +} + +bool LoadTmLogisticOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_LOGISTIC); + SetNodeOp(node, op); + return true; +} + +bool LoadTmLstmOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_LSTM; + + LSTMParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_LstmParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.forget_bias = tm_param->forget_bias; + param.clip = tm_param->clip; + param.output_len = tm_param->output_len; + param.sequence_len = tm_param->sequence_len; + param.input_size = tm_param->input_size; + param.hidden_size = tm_param->hidden_size; + param.cell_size = tm_param->cell_size; + param.has_peephole = tm_param->has_peephole; + param.has_projection = tm_param->has_projection; + param.has_clip = tm_param->has_clip; + param.has_bias = tm_param->has_bias; + param.has_init_state = tm_param->has_init_state; + param.forget_act = tm_param->forget_act; + param.input_act = tm_param->input_act; + param.output_act = tm_param->output_act; + param.cellin_act = tm_param->cellin_act; + param.cellout_act = tm_param->cellout_act; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmRnnOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_RNN; 
+ + RNNParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_RnnParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.clip = tm_param->clip; + param.output_len = tm_param->output_len; + param.sequence_len = tm_param->sequence_len; + param.input_size = tm_param->input_size; + param.hidden_size = tm_param->hidden_size; + param.has_clip = tm_param->has_clip; + param.has_bias = tm_param->has_bias; + param.has_init_state = tm_param->has_init_state; + param.activation = tm_param->activation; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmTanhOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_TANH); + SetNodeOp(node, op); + return true; +} + +bool LoadTmSigmoidOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_SIGMOID); + SetNodeOp(node, op); + return true; +} + +bool LoadTmSqueezeOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + const std::string& op_str = TM2_OPSTR_SQUEEZE; + + SqueezeParam param = any_cast(OpManager::GetOpDefParam(op_str)); + const TM2_SqueezeParam* tm_param = GetTmPtr(start_ptr, tm_op->offset_t_param); + + param.dim_0 = tm_param->dim_0; + param.dim_1 = tm_param->dim_1; + param.dim_2 = tm_param->dim_2; + param.dim_3 = tm_param->dim_3; + + StaticOp* op = CreateStaticOp(graph, op_str); + SetOperatorParam(op, param); + SetNodeOp(node, op); + return true; +} + +bool LoadTmFusedbnscalereluOp(StaticGraph* graph, StaticNode* node, void* const start_ptr, const TM2_Operator* tm_op) +{ + StaticOp* op = CreateStaticOp(graph, TM2_OPSTR_FUSEDBNSCALERELU); + SetNodeOp(node, op); + return true; +} + +op_load_t LoadTmOpFunc(uint32_t op_type) +{ + switch(op_type) + { + case TM2_OPTYPE_ACCURACY: + return LoadTmAccuracyOp; + 
case TM2_OPTYPE_BATCHNORMALIZATION: + return LoadTmBatchNormOp; + case TM2_OPTYPE_BILINEARRESIZE: + return LoadTmResizeOp; + case TM2_OPTYPE_CONCAT: + return LoadTmConcatOp; + case TM2_OPTYPE_CONST: + return LoadTmConstOp; + case TM2_OPTYPE_CONVOLUTION: + return LoadTmConvOp; + case TM2_OPTYPE_DECONVOLUTION: + return LoadTmDeconvOp; + case TM2_OPTYPE_DETECTIONOUTPUT: + return LoadTmDetectionOutputOp; + case TM2_OPTYPE_DROPOUT: + return LoadTmDropoutOp; + case TM2_OPTYPE_ELTWISE: + return LoadTmEltwiseOp; + case TM2_OPTYPE_FLATTEN: + return LoadTmFlattenOp; + case TM2_OPTYPE_FULLYCONNECTED: + return LoadTmFCOp; + case TM2_OPTYPE_INPUTOP: + return LoadTmInputOp; + case TM2_OPTYPE_LRN: + return LoadTmLRNOp; + case TM2_OPTYPE_NORMALIZE: + return LoadTmNormalizeOp; + case TM2_OPTYPE_PERMUTE: + return LoadTmPermuteOp; + case TM2_OPTYPE_POOLING: + return LoadTmPoolingOp; + case TM2_OPTYPE_PRELU: + return LoadTmPreluOp; + case TM2_OPTYPE_PRIORBOX: + return LoadTmPriorBoxOp; + case TM2_OPTYPE_REGION: + return LoadTmRegionOp; + case TM2_OPTYPE_RELU: + return LoadTmReLuOp; + case TM2_OPTYPE_RELU6: + return LoadTmRelu6Op; + case TM2_OPTYPE_REORG: + return LoadTmReorgOp; + case TM2_OPTYPE_RESHAPE: + return LoadTmReshapeOp; + case TM2_OPTYPE_ROIPOOLING: + return LoadTmROIPoolingOp; + case TM2_OPTYPE_RPN: + return LoadTmRPNOp; + case TM2_OPTYPE_SCALE: + return LoadTmScaleOp; + case TM2_OPTYPE_SLICE: + return LoadTmSliceOp; + case TM2_OPTYPE_SOFTMAX: + return LoadTmSoftmaxOp; + case TM2_OPTYPE_SPLIT: + return LoadTmSplitOp; + case TM2_OPTYPE_DETECTIONPOSTPROCESS: + return LoadTmDetectionPostProcessOp; + case TM2_OPTYPE_GEMM: + return LoadTmGemmOp; + case TM2_OPTYPE_GENERIC: + return LoadTmGenericOp; + case TM2_OPTYPE_LOGISTIC: + return LoadTmLogisticOp; + case TM2_OPTYPE_LSTM: + return LoadTmLstmOp; + case TM2_OPTYPE_RNN: + return LoadTmRnnOp; + case TM2_OPTYPE_TANH: + return LoadTmTanhOp; + case TM2_OPTYPE_SIGMOID: + return LoadTmSigmoidOp; + case TM2_OPTYPE_SQUEEZE: + return 
LoadTmSqueezeOp; + case TM2_OPTYPE_FUSEDBNSCALERELU: + return LoadTmFusedbnscalereluOp; + default: + LOG_ERROR() << "Operator #" << op_type << " not supported in tengine model yet\n"; + return nullptr; + } +} + +std::string GetOpStr(uint32_t op_type) +{ + switch(op_type) + { + case TM2_OPTYPE_ACCURACY: + return std::string(TM2_OPSTR_ACCURACY); + case TM2_OPTYPE_BATCHNORMALIZATION: + return std::string(TM2_OPSTR_BATCHNORMALIZATION); + case TM2_OPTYPE_BILINEARRESIZE: + return std::string(TM2_OPSTR_BILINEARRESIZE); + case TM2_OPTYPE_CONCAT: + return std::string(TM2_OPSTR_CONCAT); + case TM2_OPTYPE_CONST: + return std::string(TM2_OPSTR_CONST); + case TM2_OPTYPE_CONVOLUTION: + return std::string(TM2_OPSTR_CONVOLUTION); + case TM2_OPTYPE_DECONVOLUTION: + return std::string(TM2_OPSTR_DECONVOLUTION); + case TM2_OPTYPE_DETECTIONOUTPUT: + return std::string(TM2_OPSTR_DETECTIONOUTPUT); + case TM2_OPTYPE_DROPOUT: + return std::string(TM2_OPSTR_DROPOUT); + case TM2_OPTYPE_ELTWISE: + return std::string(TM2_OPSTR_ELTWISE); + case TM2_OPTYPE_FLATTEN: + return std::string(TM2_OPSTR_FLATTEN); + case TM2_OPTYPE_FULLYCONNECTED: + return std::string(TM2_OPSTR_FULLYCONNECTED); + case TM2_OPTYPE_INPUTOP: + return std::string(TM2_OPSTR_INPUTOP); + case TM2_OPTYPE_LRN: + return std::string(TM2_OPSTR_LRN); + case TM2_OPTYPE_NORMALIZE: + return std::string(TM2_OPSTR_NORMALIZE); + case TM2_OPTYPE_PERMUTE: + return std::string(TM2_OPSTR_PERMUTE); + case TM2_OPTYPE_POOLING: + return std::string(TM2_OPSTR_POOLING); + case TM2_OPTYPE_PRELU: + return std::string(TM2_OPSTR_PRELU); + case TM2_OPTYPE_PRIORBOX: + return std::string(TM2_OPSTR_PRIORBOX); + case TM2_OPTYPE_REGION: + return std::string(TM2_OPSTR_REGION); + case TM2_OPTYPE_RELU: + return std::string(TM2_OPSTR_RELU); + case TM2_OPTYPE_RELU6: + return std::string(TM2_OPSTR_RELU6); + case TM2_OPTYPE_REORG: + return std::string(TM2_OPSTR_REORG); + case TM2_OPTYPE_RESHAPE: + return std::string(TM2_OPSTR_RESHAPE); + case TM2_OPTYPE_ROIPOOLING: + 
return std::string(TM2_OPSTR_ROIPOOLING); + case TM2_OPTYPE_RPN: + return std::string(TM2_OPSTR_RPN); + case TM2_OPTYPE_SCALE: + return std::string(TM2_OPSTR_SCALE); + case TM2_OPTYPE_SLICE: + return std::string(TM2_OPSTR_SLICE); + case TM2_OPTYPE_SOFTMAX: + return std::string(TM2_OPSTR_SOFTMAX); + case TM2_OPTYPE_SPLIT: + return std::string(TM2_OPSTR_SPLIT); + case TM2_OPTYPE_DETECTIONPOSTPROCESS: + return std::string(TM2_OPSTR_DETECTIONPOSTPROCESS); + case TM2_OPTYPE_GEMM: + return std::string(TM2_OPSTR_GEMM); + case TM2_OPTYPE_GENERIC: + return std::string(TM2_OPSTR_GENERIC); + case TM2_OPTYPE_LOGISTIC: + return std::string(TM2_OPSTR_LOGISTIC); + case TM2_OPTYPE_LSTM: + return std::string(TM2_OPSTR_LSTM); + case TM2_OPTYPE_RNN: + return std::string(TM2_OPSTR_RNN); + case TM2_OPTYPE_TANH: + return std::string(TM2_OPSTR_TANH); + case TM2_OPTYPE_SIGMOID: + return std::string(TM2_OPSTR_SIGMOID); + case TM2_OPTYPE_SQUEEZE: + return std::string(TM2_OPSTR_SQUEEZE); + case TM2_OPTYPE_FUSEDBNSCALERELU: + return std::string(TM2_OPSTR_FUSEDBNSCALERELU); + default: + LOG_ERROR() << "Get operator string failed\n"; + return std::string(""); + } +} + +} // namespace TMSerializer2 + +} // namespace TEngine diff --git a/serializer/tengine/v2/tm2_op_save.cpp b/serializer/tengine/v2/tm2_op_save.cpp new file mode 100644 index 000000000..f2e56695a --- /dev/null +++ b/serializer/tengine/v2/tm2_op_save.cpp @@ -0,0 +1,825 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#include + +#include "tm2_format.h" +#include "tm2_op_serializer.hpp" + +namespace TEngine { + +namespace TMSerializer2 { + +inline void SetTmOperator(TM2_Operator* tm_op, const uint32_t op_type, const tm_uoffset_t offset) +{ + tm_op->op_ver = TM2_OP_VER; + tm_op->operator_type = op_type; + tm_op->offset_t_param = offset; +} + +tm_uoffset_t SaveTmAccuracyOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_ACCURACY, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmBatchNormOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + BatchNormParam* p = (dynamic_cast(op))->GetParam(); + TM2_BatchNormParam tm_param; + tm_param.rescale_factor = p->rescale_factor; + tm_param.eps = p->eps; + tm_param.caffe_flavor = p->caffe_flavor; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_BATCHNORMALIZATION, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_BatchNormParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmConcatOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ConcatParam* p = (dynamic_cast(op))->GetParam(); + TM2_ConcatParam tm_param; + tm_param.axis = p->axis; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_CONCAT, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ConcatParam))); + return WriteTmObject(start_ptr, 
cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmConstOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_CONST, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmConvOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ConvParam* p = (dynamic_cast(op))->GetParam(); + TM2_ConvParam tm_param; + + tm_param.kernel_h = p->kernel_h; + tm_param.kernel_w = p->kernel_w; + tm_param.stride_h = p->stride_h; + tm_param.stride_w = p->stride_w; + tm_param.dilation_h = p->dilation_h; + tm_param.dilation_w = p->dilation_w; + tm_param.input_channel = p->input_channel; + tm_param.output_channel = p->output_channel; + tm_param.group = p->group; + tm_param.activation = p->activation; + tm_param.pad_h0 = p->pad_h0; + tm_param.pad_h1 = p->pad_h1; + tm_param.pad_w0 = p->pad_w0; + tm_param.pad_w1 = p->pad_w1; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_CONVOLUTION, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ConvParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmDeconvOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + DeconvParam* p = (dynamic_cast(op))->GetParam(); + TM2_DeconvParam tm_param; + + tm_param.kernel_h = p->kernel_h; + tm_param.kernel_w = p->kernel_w; + tm_param.stride_h = p->stride_h; + tm_param.stride_w = p->stride_w; + tm_param.pad_w0 = p->pad_w0; + tm_param.pad_w1 = p->pad_w1; + tm_param.pad_h0 = p->pad_h0; + tm_param.pad_h1 = p->pad_h1; + tm_param.num_output = p->num_output; + tm_param.dilation_h = p->dilation_h; + tm_param.dilation_w = p->dilation_w; + tm_param.group = p->group; + tm_param.activation = p->activation; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_DECONVOLUTION, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_DeconvParam))); + return WriteTmObject(start_ptr, cur_pos, 
&tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmDetectionOutputOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + DetectionOutputParam* p = (dynamic_cast(op))->GetParam(); + TM2_DetectionOutputParam tm_param; + tm_param.num_classes = p->num_classes; + tm_param.keep_top_k = p->keep_top_k; + tm_param.nms_top_k = p->nms_top_k; + tm_param.confidence_threshold = p->confidence_threshold; + tm_param.nms_threshold = p->nms_threshold; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_DETECTIONOUTPUT, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_DetectionOutputParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmDropoutOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_DROPOUT, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmEltwiseOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + EltwiseParam* p = (dynamic_cast(op))->GetParam(); + TM2_EltwiseParam tm_param; + tm_param.type = p->type; + tm_param.caffe_flavor = p->caffe_flavor; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_ELTWISE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_EltwiseParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmFCOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + FCParam* p = (dynamic_cast(op))->GetParam(); + TM2_FCParam tm_param; + tm_param.num_output = p->num_output; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_FULLYCONNECTED, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_FCParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmFlattenOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + FlattenParam* p = (dynamic_cast(op))->GetParam(); + TM2_FlattenParam 
tm_param; + tm_param.axis = p->axis; + tm_param.end_axis = p->end_axis; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_FLATTEN, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_FlattenParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmInputOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_INPUTOP, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmLRNOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + LRNParam* p = (dynamic_cast(op))->GetParam(); + TM2_LRNParam tm_param; + tm_param.local_size = p->local_size; + tm_param.alpha = p->alpha; + tm_param.beta = p->beta; + tm_param.norm_region = p->norm_region; + tm_param.k = p->k; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_LRN, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_LRNParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmNormalizeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + NormalizeParam* p = (dynamic_cast(op))->GetParam(); + TM2_NormalizeParam tm_param; + tm_param.across_spatial = p->across_spatial; + tm_param.channel_shared = p->channel_shared; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_NORMALIZE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_NormalizeParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmPermuteOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + PermuteParam* p = (dynamic_cast(op))->GetParam(); + TM2_PermuteParam tm_param; + tm_param.flag = p->flag; + tm_param.order0 = p->order0; + tm_param.order1 = p->order1; + tm_param.order2 = p->order2; + tm_param.order3 = p->order3; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_PERMUTE, + WriteTmObject(start_ptr, 
cur_pos, &tm_param, sizeof(TM2_PermuteParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmPoolingOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + PoolParam* p = (dynamic_cast(op))->GetParam(); + TM2_PoolParam tm_param; + tm_param.alg = p->alg; + tm_param.kernel_h = p->kernel_h; + tm_param.kernel_w = p->kernel_w; + tm_param.stride_h = p->stride_h; + tm_param.stride_w = p->stride_w; + tm_param.global = p->global; + tm_param.caffe_flavor = p->caffe_flavor; + tm_param.pad_h0 = p->pad_h0; + tm_param.pad_w0 = p->pad_w0; + tm_param.pad_h1 = p->pad_h1; + tm_param.pad_w1 = p->pad_w1; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_POOLING, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_PoolParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmPreluOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_PRELU, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + PriorBoxParam* p = (dynamic_cast(op))->GetParam(); + TM2_PriorBoxParam tm_param; + + size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->min_size.size(); + TM2_Vector_floats* v_minsizes = ( TM2_Vector_floats* )malloc(vector_size); + v_minsizes->v_num = p->min_size.size(); + for(unsigned int i = 0; i < p->min_size.size(); i++) + { + v_minsizes->data[i] = p->min_size[i]; + } + tm_param.offset_vf_min_size = WriteTmObject(start_ptr, cur_pos, v_minsizes, vector_size); + free(v_minsizes); + + vector_size = sizeof(tm_size_t) + sizeof(float) * p->max_size.size(); + TM2_Vector_floats* v_maxsizes = ( TM2_Vector_floats* )malloc(vector_size); + v_maxsizes->v_num = p->max_size.size(); + for(unsigned int i = 0; i < p->max_size.size(); i++) + { + v_maxsizes->data[i] = 
p->max_size[i]; + } + tm_param.offset_vf_max_size = WriteTmObject(start_ptr, cur_pos, v_maxsizes, vector_size); + free(v_maxsizes); + + vector_size = sizeof(tm_size_t) + sizeof(float) * p->variance.size(); + TM2_Vector_floats* v_variance = ( TM2_Vector_floats* )malloc(vector_size); + v_variance->v_num = p->variance.size(); + for(unsigned int i = 0; i < p->variance.size(); i++) + { + v_variance->data[i] = p->variance[i]; + } + tm_param.offset_vf_variance = WriteTmObject(start_ptr, cur_pos, v_variance, vector_size); + free(v_variance); + + vector_size = sizeof(tm_size_t) + sizeof(float) * p->aspect_ratio.size(); + TM2_Vector_floats* v_ratios = ( TM2_Vector_floats* )malloc(vector_size); + v_ratios->v_num = p->aspect_ratio.size(); + for(unsigned int i = 0; i < p->aspect_ratio.size(); i++) + { + v_ratios->data[i] = p->aspect_ratio[i]; + } + tm_param.offset_vf_aspect_ratio = WriteTmObject(start_ptr, cur_pos, v_ratios, vector_size); + free(v_ratios); + + tm_param.flip = p->flip; + tm_param.clip = p->clip; + tm_param.img_size = p->img_size; + tm_param.img_h = p->img_h; + tm_param.img_w = p->img_w; + tm_param.step_w = p->step_w; + tm_param.step_h = p->step_h; + tm_param.offset = p->offset; + tm_param.num_priors = p->num_priors_; + tm_param.out_dim = p->out_dim_; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_PRIORBOX, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_PriorBoxParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmRegionOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + RegionParam* p = (dynamic_cast(op))->GetParam(); + TM2_RegionParam tm_param; + tm_param.num_classes = p->num_classes; + tm_param.side = p->side; + tm_param.num_box = p->num_box; + tm_param.coords = p->coords; + tm_param.confidence_threshold = p->confidence_threshold; + tm_param.nms_threshold = p->nms_threshold; + + size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->biases.size(); + 
TM2_Vector_floats* v_biases = ( TM2_Vector_floats* )malloc(vector_size); + v_biases->v_num = p->biases.size(); + for(unsigned int i = 0; i < p->biases.size(); i++) + { + v_biases->data[i] = p->biases[i]; + } + tm_param.offset_vf_biases = WriteTmObject(start_ptr, cur_pos, v_biases, vector_size); + free(v_biases); + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_REGION, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_RegionParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmReLuOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ReLuParam* p = (dynamic_cast(op))->GetParam(); + TM2_ReLuParam tm_param; + tm_param.negative_slope = p->negative_slope; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_RELU, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ReLuParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmRelu6Op(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_RELU6, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmReorgOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ReorgParam* p = (dynamic_cast(op))->GetParam(); + TM2_ReorgParam tm_param; + tm_param.stride = p->stride; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_REORG, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ReorgParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmReshapeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ReshapeParam* p = (dynamic_cast(op))->GetParam(); + TM2_ReshapeParam tm_param; + + tm_param.dim_0 = p->dim_0; + tm_param.dim_1 = p->dim_1; + tm_param.dim_2 = p->dim_2; + tm_param.dim_3 = p->dim_3; + tm_param.dim_size = p->dim_size; + tm_param.axis = p->axis; + + TM2_Operator 
tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_RESHAPE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ReshapeParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmResizeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ResizeParam* p = (dynamic_cast(op))->GetParam(); + TM2_ResizeParam tm_param; + tm_param.scale_x = p->scale_w; + tm_param.scale_y = p->scale_h; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_BILINEARRESIZE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ResizeParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmROIPoolingOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ROIPoolingParam* p = (dynamic_cast(op))->GetParam(); + TM2_ROIPoolingParam tm_param; + tm_param.pooled_h = p->pooled_h; + tm_param.pooled_w = p->pooled_w; + tm_param.spatial_scale = p->spatial_scale; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_ROIPOOLING, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ROIPoolingParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + RPNParam* p = (dynamic_cast(op))->GetParam(); + TM2_RPNParam tm_param; + + size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->ratios.size(); + TM2_Vector_floats* v_ratios = ( TM2_Vector_floats* )malloc(vector_size); + v_ratios->v_num = p->ratios.size(); + for(unsigned int i = 0; i < p->ratios.size(); i++) + { + v_ratios->data[i] = p->ratios[i]; + } + tm_param.offset_vf_ratios = WriteTmObject(start_ptr, cur_pos, v_ratios, vector_size); + free(v_ratios); + + vector_size = sizeof(tm_size_t) + sizeof(float) * p->anchor_scales.size(); + TM2_Vector_floats* v_scales = ( TM2_Vector_floats* )malloc(vector_size); + v_scales->v_num = p->anchor_scales.size(); + for(unsigned int i = 0; i < 
p->anchor_scales.size(); i++) + { + v_scales->data[i] = p->anchor_scales[i]; + } + tm_param.offset_vf_anchor_scales = WriteTmObject(start_ptr, cur_pos, v_scales, vector_size); + free(v_scales); + + vector_size = sizeof(tm_size_t) + sizeof(float) * p->anchors_.size() * 4; + TM2_Vector_anchors* v_anchors = ( TM2_Vector_anchors* )malloc(vector_size); + v_anchors->v_num = p->anchors_.size(); + for(unsigned int i = 0; i < p->anchors_.size(); i++) + { + v_anchors->data[i][0] = p->anchors_[i].x0; + v_anchors->data[i][1] = p->anchors_[i].y0; + v_anchors->data[i][2] = p->anchors_[i].x1; + v_anchors->data[i][3] = p->anchors_[i].y1; + } + tm_param.offset_va_anchors = WriteTmObject(start_ptr, cur_pos, v_anchors, vector_size); + free(v_anchors); + + tm_param.feat_stride = p->feat_stride; + tm_param.basesize = p->basesize; + tm_param.min_size = p->min_size; + tm_param.per_nms_topn = p->per_nms_topn; + tm_param.post_nms_topn = p->post_nms_topn; + tm_param.nms_thresh = p->nms_thresh; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_RPN, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_RPNParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmScaleOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + ScaleParam* p = (dynamic_cast(op))->GetParam(); + TM2_ScaleParam tm_param; + tm_param.axis = p->axis; + tm_param.num_axes = p->num_axes; + tm_param.bias_term = p->bias_term; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_SCALE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ScaleParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + SliceParam* p = (dynamic_cast(op))->GetParam(); + TM2_SliceParam tm_param; + + tm_param.axis = p->axis; + tm_param.iscaffe = p->iscaffe; + + if((p->slice_point_).size()) + { + size_t vector_size = sizeof(tm_size_t) + 
sizeof(int32_t) * (p->slice_point_).size(); + TM2_Vector_dims* v_slice_points = ( TM2_Vector_dims* )malloc(vector_size); + v_slice_points->v_num = (p->slice_point_).size(); + for(unsigned int i = 0; i < (p->slice_point_).size(); i++) + { + v_slice_points->dims[i] = p->slice_point_[i]; + } + tm_param.offset_vi_slice_points = WriteTmObject(start_ptr, cur_pos, v_slice_points, vector_size); + free(v_slice_points); + } + else + tm_param.offset_vi_slice_points = TM2_NOT_SET; + + if((p->begin_).size()) + { + size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * (p->begin_).size(); + TM2_Vector_dims* v_begins = ( TM2_Vector_dims* )malloc(vector_size); + v_begins->v_num = (p->begin_).size(); + for(unsigned int i = 0; i < (p->begin_).size(); i++) + { + v_begins->dims[i] = p->begin_[i]; + } + tm_param.offset_vi_begins = WriteTmObject(start_ptr, cur_pos, v_begins, vector_size); + free(v_begins); + } + else + tm_param.offset_vi_begins = TM2_NOT_SET; + + if((p->size_).size()) + { + size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * (p->size_).size(); + TM2_Vector_dims* v_sizes = ( TM2_Vector_dims* )malloc(vector_size); + v_sizes->v_num = (p->size_).size(); + for(unsigned int i = 0; i < (p->size_).size(); i++) + { + v_sizes->dims[i] = p->size_[i]; + } + tm_param.offset_vi_sizes = WriteTmObject(start_ptr, cur_pos, v_sizes, vector_size); + free(v_sizes); + } + else + tm_param.offset_vi_sizes = TM2_NOT_SET; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_SLICE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SliceParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmSoftmaxOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + SoftmaxParam* p = (dynamic_cast(op))->GetParam(); + TM2_SoftmaxParam tm_param; + tm_param.axis = p->axis; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_SOFTMAX, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SoftmaxParam))); + return 
WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmSplitOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_SPLIT, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmDetectionPostProcessOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + DetectionPostProcessParam* p = (dynamic_cast(op))->GetParam(); + TM2_DetectionPostProcessParam tm_param; + + tm_param.max_detections = p->max_detections; + tm_param.max_classes_per_detection = p->max_classes_per_detection; + tm_param.nms_score_threshold = p->nms_score_threshold; + tm_param.nms_iou_threshold = p->nms_iou_threshold; + tm_param.num_classes = p->num_classes; + + size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->scales.size(); + TM2_Vector_floats* v_scales = ( TM2_Vector_floats* )malloc(vector_size); + v_scales->v_num = p->scales.size(); + for(unsigned int i = 0; i < p->scales.size(); i++) + { + v_scales->data[i] = p->scales[i]; + } + tm_param.offset_vf_scales = WriteTmObject(start_ptr, cur_pos, v_scales, vector_size); + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_DETECTIONPOSTPROCESS, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_DetectionPostProcessParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmGemmOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + GemmParam* p = (dynamic_cast(op))->GetParam(); + TM2_GemmParam tm_param; + + tm_param.alpha = p->alpha; + tm_param.beta = p->beta; + tm_param.transA = p->transA; + tm_param.transB = p->transB; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_GEMM, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_GemmParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmGenericOp(void* const start_ptr, tm_uoffset_t* 
cur_pos, Operator* op) +{ + GenericParam* p = (dynamic_cast(op))->GetParam(); + TM2_GenericParam tm_param; + + tm_param.max_input_num = p->max_input_num; + tm_param.max_output_num = p->max_output_num; + + TM2_String op_name; + op_name.size = strlen(p->op_name) + 1; // including trailing \0 + op_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, p->op_name, op_name.size); + tm_param.offset_s_opname = WriteTmObject(start_ptr, cur_pos, &op_name, sizeof(TM2_String)); + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_GENERIC, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_GenericParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmLogisticOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_LOGISTIC, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmLstmOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + LSTMParam* p = (dynamic_cast(op))->GetParam(); + TM2_LstmParam tm_param; + + tm_param.forget_bias = p->forget_bias; + tm_param.clip = p->clip; + tm_param.output_len = p->output_len; + tm_param.sequence_len = p->sequence_len; + tm_param.input_size = p->input_size; + tm_param.hidden_size = p->hidden_size; + tm_param.cell_size = p->cell_size; + tm_param.has_peephole = p->has_peephole; + tm_param.has_projection = p->has_projection; + tm_param.has_clip = p->has_clip; + tm_param.has_bias = p->has_bias; + tm_param.has_init_state = p->has_init_state; + tm_param.forget_act = p->forget_act; + tm_param.input_act = p->input_act; + tm_param.output_act = p->output_act; + tm_param.cellin_act = p->cellin_act; + tm_param.cellout_act = p->cellout_act; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_LSTM, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_LstmParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} 
+ +tm_uoffset_t SaveTmRnnOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + RNNParam* p = (dynamic_cast(op))->GetParam(); + TM2_RnnParam tm_param; + + tm_param.clip = p->clip; + tm_param.output_len = p->output_len; + tm_param.sequence_len = p->sequence_len; + tm_param.input_size = p->input_size; + tm_param.hidden_size = p->hidden_size; + tm_param.has_clip = p->has_clip; + tm_param.has_bias = p->has_bias; + tm_param.has_init_state = p->has_init_state; + tm_param.activation = p->activation; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_RNN, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_RnnParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmTanhOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_TANH, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmSigmoidOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_SIGMOID, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmSqueezeOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + SqueezeParam* p = (dynamic_cast(op))->GetParam(); + TM2_SqueezeParam tm_param; + + tm_param.dim_0 = p->dim_0; + tm_param.dim_1 = p->dim_1; + tm_param.dim_2 = p->dim_2; + tm_param.dim_3 = p->dim_3; + + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_SQUEEZE, + WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SqueezeParam))); + return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator)); +} + +tm_uoffset_t SaveTmFusedbnscalereluOp(void* const start_ptr, tm_uoffset_t* cur_pos, Operator* op) +{ + TM2_Operator tm_op; + SetTmOperator(&tm_op, TM2_OPTYPE_FUSEDBNSCALERELU, TM2_NOT_SET); + return WriteTmObject(start_ptr, cur_pos, &tm_op, 
sizeof(TM2_Operator)); +} + +op_save_t SaveTmOpFunc(uint32_t op_type) +{ + switch(op_type) + { + case TM2_OPTYPE_ACCURACY: + return SaveTmAccuracyOp; + case TM2_OPTYPE_BATCHNORMALIZATION: + return SaveTmBatchNormOp; + case TM2_OPTYPE_BILINEARRESIZE: + return SaveTmResizeOp; + case TM2_OPTYPE_CONCAT: + return SaveTmConcatOp; + case TM2_OPTYPE_CONST: + return SaveTmConstOp; + case TM2_OPTYPE_CONVOLUTION: + return SaveTmConvOp; + case TM2_OPTYPE_DECONVOLUTION: + return SaveTmDeconvOp; + case TM2_OPTYPE_DETECTIONOUTPUT: + return SaveTmDetectionOutputOp; + case TM2_OPTYPE_DROPOUT: + return SaveTmDropoutOp; + case TM2_OPTYPE_ELTWISE: + return SaveTmEltwiseOp; + case TM2_OPTYPE_FLATTEN: + return SaveTmFlattenOp; + case TM2_OPTYPE_FULLYCONNECTED: + return SaveTmFCOp; + case TM2_OPTYPE_INPUTOP: + return SaveTmInputOp; + case TM2_OPTYPE_LRN: + return SaveTmLRNOp; + case TM2_OPTYPE_NORMALIZE: + return SaveTmNormalizeOp; + case TM2_OPTYPE_PERMUTE: + return SaveTmPermuteOp; + case TM2_OPTYPE_POOLING: + return SaveTmPoolingOp; + case TM2_OPTYPE_PRELU: + return SaveTmPreluOp; + case TM2_OPTYPE_PRIORBOX: + return SaveTmPriorBoxOp; + case TM2_OPTYPE_REGION: + return SaveTmRegionOp; + case TM2_OPTYPE_RELU: + return SaveTmReLuOp; + case TM2_OPTYPE_RELU6: + return SaveTmRelu6Op; + case TM2_OPTYPE_REORG: + return SaveTmReorgOp; + case TM2_OPTYPE_RESHAPE: + return SaveTmReshapeOp; + case TM2_OPTYPE_ROIPOOLING: + return SaveTmROIPoolingOp; + case TM2_OPTYPE_RPN: + return SaveTmRPNOp; + case TM2_OPTYPE_SCALE: + return SaveTmScaleOp; + case TM2_OPTYPE_SLICE: + return SaveTmSliceOp; + case TM2_OPTYPE_SOFTMAX: + return SaveTmSoftmaxOp; + case TM2_OPTYPE_SPLIT: + return SaveTmSplitOp; + case TM2_OPTYPE_DETECTIONPOSTPROCESS: + return SaveTmDetectionPostProcessOp; + case TM2_OPTYPE_GEMM: + return SaveTmGemmOp; + case TM2_OPTYPE_GENERIC: + return SaveTmGenericOp; + case TM2_OPTYPE_LOGISTIC: + return SaveTmLogisticOp; + case TM2_OPTYPE_LSTM: + return SaveTmLstmOp; + case TM2_OPTYPE_RNN: + return 
SaveTmRnnOp; + case TM2_OPTYPE_TANH: + return SaveTmTanhOp; + case TM2_OPTYPE_SIGMOID: + return SaveTmSigmoidOp; + case TM2_OPTYPE_SQUEEZE: + return SaveTmSqueezeOp; + case TM2_OPTYPE_FUSEDBNSCALERELU: + return SaveTmFusedbnscalereluOp; + default: + LOG_ERROR() << "Operator #" << op_type << " not supported in tengine model yet\n"; + return nullptr; + } +} + +} // namespace TMSerializer2 + +} // namespace TEngine diff --git a/serializer/tengine/v2/tm2_serializer.cpp b/serializer/tengine/v2/tm2_serializer.cpp new file mode 100644 index 000000000..33209e82a --- /dev/null +++ b/serializer/tengine/v2/tm2_serializer.cpp @@ -0,0 +1,750 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2019, Open AI Lab + * Author: jingyou@openailab.com + */ +#include +#include +#include +#include +#include +#include +#include + +#include "operator_manager.hpp" +#include "static_graph.hpp" +#include "graph.hpp" +#include "node.hpp" +#include "tensor.hpp" +#include "compiler.hpp" + +#include "tm2_format.h" +#include "tm2_serializer.hpp" +#include "tm2_op_serializer.hpp" + +#define TYPE_INFO_INT32 1 +#define TYPE_INFO_UINT32 2 +#define TYPE_INFO_FLOAT 3 +#define TYPE_INFO_POINTER 4 +#define TYPE_INFO_GENERIC 5 + +namespace TEngine { + +extern int NodeSetParamGeneric(void* node, const char* param_name, const char* type_name, const void* param_val, int size); +extern int NodeAddParamGeneric(void* node, const char* param_name, const char* type_name, int param_size); + +} + +using namespace TEngine; + +namespace TEngine { + +namespace TMSerializer2 { + +static int typename_to_int(const char* name) +{ + if(name == nullptr) + return TYPE_INFO_POINTER; + + if(!strcmp(name, typeid(int).name())) + return TYPE_INFO_INT32; + if(!strcmp(name, typeid(unsigned int).name())) + return TYPE_INFO_UINT32; + if(!strcmp(name, typeid(float).name())) + return TYPE_INFO_FLOAT; + + return TYPE_INFO_GENERIC; +} + +static const char* int_to_typename(int id) +{ + switch(id) + { + case TYPE_INFO_INT32: + return typeid(int).name(); + case TYPE_INFO_UINT32: + return typeid(unsigned int).name(); + case TYPE_INFO_FLOAT: + return typeid(float).name(); + case TYPE_INFO_POINTER: + case TYPE_INFO_GENERIC: + default: + return nullptr; + } +} + +bool TmSerializer2::IsSaveString(void) +{ + const char* env = std::getenv("TM_NO_STRING"); + + if(env) + return false; + else + return true; +} + +bool TmSerializer2::IsSaveData(void) +{ + const char* env = std::getenv("TM_FOR_BENCHMARK"); + + if(env) + return false; + else + return true; +} + +tm_uoffset_t TmSerializer2::SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, Tensor* tensor, + unsigned int tensor_id, unsigned int 
buffer_id) +{ + TM2_Tensor tm_tensor; + tm_tensor.tensor_id = tensor_id; + tm_tensor.buffer_id = buffer_id; + tm_tensor.type = tensor->GetType(); + tm_tensor.data_type = tensor->GetDataType(); + tm_tensor.layout = (tensor->GetShape()).GetDataLayout(); + + bool tm_with_string = IsSaveString(); + + if(tm_with_string) + { + std::string name = tensor->GetName(); + TM2_String tensor_name; + tensor_name.size = name.size()+1; // including trailing \0 + tensor_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), tensor_name.size); + tm_tensor.offset_s_tname = WriteTmObject(start_ptr, cur_pos, &tensor_name, sizeof(TM2_String)); + } + else + tm_tensor.offset_s_tname = TM2_NOT_SET; + + /* Get the dims of the tensor */ + TShape& shape = tensor->GetShape(); + std::vector& dim = shape.GetDim(); + size_t vector_size; + if(dim.size()) + { + /* Write the vector of dims */ + vector_size = sizeof(tm_size_t) + sizeof(int32_t) * dim.size(); + TM2_Vector_dims* v_dims = ( TM2_Vector_dims* )malloc(vector_size); + v_dims->v_num = dim.size(); + for(unsigned int i = 0; i < dim.size(); i++) + { + v_dims->dims[i] = dim[i]; + } + tm_tensor.offset_vd_dims = WriteTmObject(start_ptr, cur_pos, v_dims, vector_size); + free(v_dims); + } + else + tm_tensor.offset_vd_dims = TM2_NOT_SET; + + /* Write the quant params */ + std::vector* params = tensor->GetQuantParam(); + if(params->size() != 0) + { + vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * params->size(); + TM2_Vector_offsets* v_qtparams = ( TM2_Vector_offsets* )malloc(vector_size); + v_qtparams->v_num = params->size(); + for(unsigned int i = 0; i < v_qtparams->v_num; i++) + { + QuantParam& p = (*params)[i]; + TM2_QuantParam qtparam; + + qtparam.zero_point = p.zero_point; + qtparam.scale = p.scale; + qtparam.width = p.width; + + v_qtparams->offsets[i] = WriteTmObject(start_ptr, cur_pos, &qtparam, sizeof(TM2_QuantParam)); + } + + /* Write the vector of quant params */ + tm_tensor.offect_vo_quantparams = 
WriteTmObject(start_ptr, cur_pos, v_qtparams, vector_size); + } + else + tm_tensor.offect_vo_quantparams = TM2_NOT_SET; + + /* Write the tensor */ + return WriteTmObject(start_ptr, cur_pos, &tm_tensor, sizeof(TM2_Tensor)); +} + +tm_uoffset_t TmSerializer2::SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, Node* node, + name_map_t& tensor_name_map) +{ + TM2_Node tm_node; + tm_node.node_id = node->GetNodeIndex(); + tm_node.dynamic_shape = node->IsDynamicShape(); + + bool tm_with_string = IsSaveString(); + + if(tm_with_string) + { + std::string name = node->GetName(); + TM2_String node_name; + node_name.size = name.size()+1; // including trailing \0 + node_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), node_name.size); + tm_node.offset_s_nname = WriteTmObject(start_ptr, cur_pos, &node_name, sizeof(TM2_String)); + } + else + tm_node.offset_s_nname = TM2_NOT_SET; + + unsigned int input_num = node->GetInputNum(); + unsigned int output_num = node->GetOutputNum(); + + if(input_num) + { + /* Write the vector of input indices */ + size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * input_num; + TM2_Vector_indices* v_input_indices = ( TM2_Vector_indices* )malloc(vector_size); + v_input_indices->v_num = input_num; + for(unsigned int i = 0; i < input_num; i++) + { + Tensor* p_tensor = node->GetInputTensor(i); + v_input_indices->indices[i] = tensor_name_map[p_tensor->GetName()]; + } + tm_node.offset_vi_input_tensors = WriteTmObject(start_ptr, cur_pos, v_input_indices, vector_size); + free(v_input_indices); + } + else + tm_node.offset_vi_input_tensors = TM2_NOT_SET; + + if(output_num) + { + /* Write the vector of output indices */ + size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * output_num; + TM2_Vector_indices* v_output_indices = ( TM2_Vector_indices* )malloc(vector_size); + v_output_indices->v_num = output_num; + for(unsigned int i = 0; i < output_num; i++) + { + Tensor* p_tensor = node->GetOutputTensor(i); + 
v_output_indices->indices[i] = tensor_name_map[p_tensor->GetName()]; + } + tm_node.offset_vi_output_tensors = WriteTmObject(start_ptr, cur_pos, v_output_indices, vector_size); + free(v_output_indices); + } + else + tm_node.offset_vi_output_tensors = TM2_NOT_SET; + + /* Write tm operator */ + std::string op_name = node->GetOp()->GetName(); + if(op_name == "Input") + op_name = TM2_OPSTR_INPUTOP; + if(!FindOpSaveMethod(op_name)) + { + LOG_ERROR() << "cannot find save function for operator: " << op_name << "\n"; + return false; + } + op_save_t op_save_func = any_cast(GetOpSaveMethod(op_name)); + tm_node.offset_t_operator = op_save_func(start_ptr, cur_pos, node->GetOp()); + + /* No custom attrs */ + if(!node->ExistAttr(ATTR_CUSTOM_ATTR)) + { + tm_node.offset_vo_attrs = TM2_NOT_SET; + /* Write the node */ + return WriteTmObject(start_ptr, cur_pos, &tm_node, sizeof(TM2_Node)); + } + + /* Get custom attrs of node */ + std::vector tm_attrs; + node_custom_attr_map_t* attr_map = any_cast(&node->GetAttr(ATTR_CUSTOM_ATTR)); + node_custom_attr_map_t::iterator it = (*attr_map).begin(); + while(it != (*attr_map).end()) + { + TM2_Attr tm_attr; + std::string attr_name = it->first; + CustomNodeAttr attr = it->second; + + TM2_String tm_attr_name, tm_attr_val; + tm_attr_name.size = attr_name.size()+1; // including trailing \0 + tm_attr_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, attr_name.c_str(), attr_name.size()); + tm_attr.offset_s_attrname = WriteTmObject(start_ptr, cur_pos, &tm_attr_name, sizeof(TM2_String)); + + tm_attr_val.size = attr.attr_size; // no trailing \0 + tm_attr_val.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, &(attr.mem), attr.attr_size); + tm_attr.offset_s_attrval = WriteTmObject(start_ptr, cur_pos, &tm_attr_val, sizeof(TM2_String)); + + tm_attr.attr_type = typename_to_int(attr.type_name); + + tm_attrs.push_back(tm_attr); + ++it; + } + + /* Write custom attrs */ + size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tm_attrs.size(); + 
TM2_Vector_offsets* v_attrs = ( TM2_Vector_offsets* )malloc(vector_size); + v_attrs->v_num = tm_attrs.size(); + for(unsigned int i = 0; i < tm_attrs.size(); i++) + { + v_attrs->offsets[i] = WriteTmObject(start_ptr, cur_pos, &(tm_attrs[i]), sizeof(TM2_Attr)); + } + tm_node.offset_vo_attrs = WriteTmObject(start_ptr, cur_pos, v_attrs, vector_size); + free(v_attrs); + + /* Write the node */ + return WriteTmObject(start_ptr, cur_pos, &tm_node, sizeof(TM2_Node)); +} + +tm_uoffset_t TmSerializer2::SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, Graph* graph) +{ + TM2_Subgraph tm_subgraph; + tm_subgraph.subgraph_id = 0; /* subgraph_id starts from 0 */ + tm_subgraph.offset_s_sname = TM2_NOT_SET; + + tm_subgraph.graph_layout = graph->GetLayout(); + tm_subgraph.model_layout = graph->GetModelLayout(); + + unsigned int tensor_num = 0; + unsigned int buffer_num = 0; + std::vector tensor_ptrs; + std::vector buf_ptrs; + std::vector buf_sizes; + name_map_t tensor_name_map; /* map of tensor name and tensor index */ + bool tm_no_data = !IsSaveData(); + + /* Write the nodes */ + size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * graph->seq_nodes.size(); + TM2_Vector_offsets* v_nodes = ( TM2_Vector_offsets* )malloc(vector_size); + v_nodes->v_num = graph->seq_nodes.size(); + for(unsigned int i = 0; i < graph->seq_nodes.size(); i++) + { + Node* p_node = graph->seq_nodes[i]; + for(unsigned int k = 0; k < p_node->GetOutputNum(); k++) + { + Tensor* p_tensor = p_node->GetOutputTensor(k); + tensor_ptrs.push_back(p_tensor); + tensor_name_map[p_tensor->GetName()] = tensor_num; + tensor_num++; + } + v_nodes->offsets[i] = SaveTmNode(start_ptr, cur_pos, p_node, tensor_name_map); + } + /* Write the vector of nodes */ + tm_subgraph.offset_vo_seq_nodes = WriteTmObject(start_ptr, cur_pos, v_nodes, vector_size); + + /* Write the tensors */ + vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor_num; + TM2_Vector_offsets* v_tensors = ( TM2_Vector_offsets* 
)malloc(vector_size); + v_tensors->v_num = tensor_num; + for(unsigned int i = 0; i < tensor_num; i++) + { + Tensor* p_tensor = tensor_ptrs[i]; + if(p_tensor->GetType() == kConstTensor) + { + buf_ptrs.push_back(p_tensor->GetMemAddr()); + buf_sizes.push_back(p_tensor->GetTotalSize()); + buffer_num++; + } + + v_tensors->offsets[i] = SaveTmTensor(start_ptr, cur_pos, p_tensor, i, buffer_num - 1); + } + /* Write the vector of tensors */ + tm_subgraph.offset_vo_tensors = WriteTmObject(start_ptr, cur_pos, v_tensors, vector_size); + + /* Write the buffers */ + vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * buffer_num; + TM2_Vector_offsets* v_buffers = ( TM2_Vector_offsets* )malloc(vector_size); + v_buffers->v_num = buffer_num; + for(unsigned int i = 0; i < buffer_num; i++) + { + TM2_Buffer tm_buf; + tm_buf.size = buf_sizes[i]; + + if(tm_no_data) + { + /* TM2_FOR_BENCHMARK environment variable exists. Not write buf data into the tm file */ + tm_buf.offset_data = TM2_NOT_SET; + } + else + { + /* TM2_FOR_BENCHMARK environment variable does not exist */ + tm_buf.offset_data = + WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast(buf_ptrs[i]), tm_buf.size); + } + v_buffers->offsets[i] = WriteTmObject(start_ptr, cur_pos, &tm_buf, sizeof(TM2_Buffer)); + } + /* Write the vector of buffers */ + tm_subgraph.offset_vo_buffers = WriteTmObject(start_ptr, cur_pos, v_buffers, vector_size); + + /* Write the vector of input indices */ + vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->input_nodes.size(); + TM2_Vector_indices* v_input_indices = ( TM2_Vector_indices* )malloc(vector_size); + v_input_indices->v_num = graph->input_nodes.size(); + for(unsigned int i = 0; i < graph->input_nodes.size(); i++) + { + v_input_indices->indices[i] = graph->input_nodes[i]->GetNodeIndex(); + } + tm_subgraph.offset_vi_input_indices = WriteTmObject(start_ptr, cur_pos, v_input_indices, vector_size); + + /* Write the vector of output indices */ + vector_size = sizeof(tm_size_t) + 
sizeof(uint32_t) * graph->output_nodes.size(); + TM2_Vector_indices* v_output_indices = ( TM2_Vector_indices* )malloc(vector_size); + v_output_indices->v_num = graph->output_nodes.size(); + for(unsigned int i = 0; i < graph->output_nodes.size(); i++) + { + v_output_indices->indices[i] = graph->output_nodes[i]->GetNodeIndex(); + } + tm_subgraph.offset_vi_output_indices = WriteTmObject(start_ptr, cur_pos, v_output_indices, vector_size); + + /* Write the subgraph */ + tm_uoffset_t ret = WriteTmObject(start_ptr, cur_pos, &tm_subgraph, sizeof(TM2_Subgraph)); + + /* Free the memory of vectors */ + free(v_tensors); + free(v_buffers); + free(v_nodes); + free(v_input_indices); + free(v_output_indices); + + return ret; +} + +bool TmSerializer2::SaveModelIntoMem(void* start_ptr, Graph* graph, uint32_t* tm_model_size) +{ + bool tm_with_string = IsSaveString(); + + tm_uoffset_t cur_pos = sizeof(TM2_Header); + + /* Define the TM2_Header object */ + TM2_Header header; + header.ver_main = TM2_FILE_VER_MAIN; + header.ver_sub = TM2_FILE_VER_SUB; + header.ver_compile = TM2_FILE_VER_COMPILE; + + /* Define the TM2_Model object */ + TM2_Model tm_model; + tm_model.orig_format = graph->GetModelFormat(); + tm_model.sub_format = 0; + + if(tm_with_string) + { + const std::string& fname = graph->GetName(); + TM2_String model_name; + model_name.size = fname.size()+1; // including trailing \0 + model_name.offset_data = WriteTmFileAlign1(start_ptr, &cur_pos, fname.c_str(), model_name.size); + tm_model.offset_s_mname = WriteTmObject(start_ptr, &cur_pos, &model_name, sizeof(TM2_String)); + } + else + tm_model.offset_s_mname = TM2_NOT_SET; + + /* Write the subgraphs */ + /* Only 1 subgraph is supported currently */ + size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * 1; + TM2_Vector_offsets* v_subgraphs = ( TM2_Vector_offsets* )malloc(vector_size); + v_subgraphs->v_num = 1; + v_subgraphs->offsets[0] = SaveTmSubgraph(start_ptr, &cur_pos, graph); + + /* Write the vector of subgraphs */ + 
tm_model.offset_vo_subgraphs = WriteTmObject(start_ptr, &cur_pos, v_subgraphs, vector_size); + + /* Write the model */ + header.offset_root = WriteTmObject(start_ptr, &cur_pos, &tm_model, sizeof(TM2_Model)); + *tm_model_size = cur_pos; + + /* Write the header */ + cur_pos = 0; + WriteTmObject(start_ptr, &cur_pos, &header, sizeof(TM2_Header)); + + free(v_subgraphs); + + return true; +} + +bool TmSerializer2::LoadNode(StaticGraph* graph, StaticNode* node, const TM2_Node* tm_node, void* mmap_buf) +{ + if(tm_node->offset_vi_input_tensors != TM2_NOT_SET) + { + const TM2_Vector_indices* v_input_tensors = + GetTmPtr(mmap_buf, tm_node->offset_vi_input_tensors); + + /* Set the input tensors to the node */ + for(unsigned int i = 0; i < v_input_tensors->v_num; i++) + { + StaticTensor* tensor = graph->tensor_list[v_input_tensors->indices[i]].get(); + if(!tensor) + { + LOG_ERROR() << "The input tensor not exist: " << v_input_tensors->indices[i] << "\n"; + return false; + } + AddNodeInputTensor(node, tensor); + } + } + + if(tm_node->offset_vi_output_tensors != TM2_NOT_SET) + { + const TM2_Vector_indices* v_output_tensors = + GetTmPtr(mmap_buf, tm_node->offset_vi_output_tensors); + + /* Set the output tensors to the node */ + for(unsigned int i = 0; i < v_output_tensors->v_num; i++) + { + StaticTensor* tensor = graph->tensor_list[v_output_tensors->indices[i]].get(); + if(!tensor) + { + LOG_ERROR() << "The output tensor not exist: " << v_output_tensors->indices[i] << "\n"; + return false; + } + AddNodeOutputTensor(node, tensor); + } + } + + /* set the custom attributes into static node */ + if(tm_node->offset_vo_attrs == TM2_NOT_SET) + return true; + + const TM2_Vector_offsets* v_attrs = GetTmPtr(mmap_buf, tm_node->offset_vo_attrs); + for(unsigned int i = 0; i < v_attrs->v_num; i++) + { + const TM2_Attr* tm_attr = GetTmPtr(mmap_buf, v_attrs->offsets[i]); + const TM2_String* tm_attr_name = GetTmPtr(mmap_buf, tm_attr->offset_s_attrname); + const TM2_String* tm_attr_val = 
GetTmPtr(mmap_buf, tm_attr->offset_s_attrval); + + const char* attr_name = GetTmPtr(mmap_buf, tm_attr_name->offset_data); + const char* attr_val = GetTmPtr(mmap_buf, tm_attr_val->offset_data); + const char* type_name = int_to_typename(tm_attr->attr_type); + + if(NodeAddParamGeneric(node, attr_name, type_name, tm_attr_val->size) < 0 || + NodeSetParamGeneric(node, attr_name, type_name, attr_val, tm_attr_val->size) < 0) + { + LOG_ERROR() << "Add and set node param failed\n"; + return false; + } + } + + return true; +} + +bool TmSerializer2::LoadTensor(StaticGraph* graph, const TM2_Tensor* tm_tensor, const TM2_Buffer* tm_buf, void* mmap_buf) +{ + /* Set the tensor name */ + int idx = tm_tensor->tensor_id; + std::string tm_tensor_name; + if(tm_tensor->offset_s_tname == TM2_NOT_SET) + tm_tensor_name = "tensor_" + std::to_string(idx); + else + { + const TM2_String* tm_str = GetTmPtr(mmap_buf, tm_tensor->offset_s_tname); + tm_tensor_name.assign(GetTmPtr(mmap_buf, tm_str->offset_data), tm_str->size-1); + } + + /* Create the static tensor */ + StaticTensor* tensor; + if(tm_tensor->type == kConstTensor) + tensor = CreateStaticConstTensor(graph, tm_tensor_name); + else + tensor = CreateStaticTensor(graph, tm_tensor_name); + if(!tensor) + { + LOG_ERROR() << "Create static const tensor failed: " << tm_tensor_name << "\n"; + return false; + } + + /* Set the dims */ + if(tm_tensor->offset_vd_dims != TM2_NOT_SET) + { + const TM2_Vector_dims* v_dims = GetTmPtr(mmap_buf, tm_tensor->offset_vd_dims); + if(!v_dims || !(v_dims->v_num)) + { + LOG_ERROR() << "Get tensor dims failed\n"; + return false; + } + std::vector dims; + for(unsigned int i = 0; i < v_dims->v_num; i++) + dims.push_back(v_dims->dims[i]); + SetTensorDim(tensor, dims); + } + + /* Set the tensor type and the data type */ + SetTensorType(tensor, tm_tensor->type); + SetTensorDataType(tensor, tm_tensor->data_type); + + /* Set the memory size and pointer */ + if(tm_tensor->type == kConstTensor) + { + SetTensorSize(tensor, 
tm_buf->size); + void* buf = malloc(tm_buf->size); + if(tm_buf->offset_data != TM2_NOT_SET) + { + memcpy(buf, GetTmPtr(mmap_buf, tm_buf->offset_data), tm_buf->size); + } + + SetConstTensorBuffer(tensor, buf); + SetConstTensorFileLocation(tensor, -1, 0); + } + + /* Set the quant params */ + if(tm_tensor->offect_vo_quantparams != TM2_NOT_SET) + { + const TM2_Vector_offsets* v_quantparams = GetTmPtr(mmap_buf, tm_tensor->offect_vo_quantparams); + + /* currently only support one quant param */ + assert(v_quantparams->v_num == 1); + + const TM2_QuantParam* tm_qtparam = GetTmPtr(mmap_buf, v_quantparams->offsets[0]); + tensor->zero_point = tm_qtparam->zero_point; + tensor->scale = tm_qtparam->scale; + tensor->width = tm_qtparam->width; + } + + return true; +} + +bool TmSerializer2::LoadGraph(StaticGraph* graph, const TM2_Model* tm_model, void* mmap_buf) +{ + const TM2_Vector_offsets* v_graphs = GetTmPtr(mmap_buf, tm_model->offset_vo_subgraphs); + const TM2_Subgraph* tm_graph = GetTmPtr(mmap_buf, v_graphs->offsets[0]); + + const TM2_Vector_offsets* v_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vo_seq_nodes); + const TM2_Vector_offsets* v_tensors = GetTmPtr(mmap_buf, tm_graph->offset_vo_tensors); + const TM2_Vector_offsets* v_buffers = GetTmPtr(mmap_buf, tm_graph->offset_vo_buffers); + + SetGraphLayout(graph, tm_graph->graph_layout); + SetModelLayout(graph, tm_graph->model_layout); + + /* Load const tensors */ + for(unsigned int i = 0; i < v_tensors->v_num; i++) + { + const TM2_Tensor* tm_tensor = GetTmPtr(mmap_buf, v_tensors->offsets[i]); + const TM2_Buffer* tm_buf; + if(tm_tensor->type == kConstTensor) + tm_buf = GetTmPtr(mmap_buf, v_buffers->offsets[tm_tensor->buffer_id]); + else + tm_buf = nullptr; + LoadTensor(graph, tm_tensor, tm_buf, mmap_buf); + } + + /* Create static nodes */ + unsigned int i; + for(i = 0; i < v_nodes->v_num; i++) + { + const TM2_Node* tm_node = GetTmPtr(mmap_buf, v_nodes->offsets[i]); + int idx = tm_node->node_id; + std::string tm_node_name; + 
if(tm_node->offset_s_nname == TM2_NOT_SET) + tm_node_name = "node_" + std::to_string(idx); + else + { + const TM2_String* tm_str = GetTmPtr(mmap_buf, tm_node->offset_s_nname); + tm_node_name.assign(GetTmPtr(mmap_buf, tm_str->offset_data), tm_str->size-1); + } + + const TM2_Operator* tm_operator = GetTmPtr(mmap_buf, tm_node->offset_t_operator); + const std::string& tm_op_name = GetOpStr(tm_operator->operator_type); + + if(!FindOpLoadMethod(tm_op_name)) + { + LOG_ERROR() << "cannot find load function for operator: " << tm_op_name << "\n"; + break; + } + + StaticNode* node = CreateStaticNode(graph, tm_node_name); + if(!LoadNode(graph, node, tm_node, mmap_buf)) + break; + + op_load_t op_func = any_cast(GetOpLoadMethod(tm_op_name)); + + if(!op_func(graph, node, mmap_buf, tm_operator)) + break; + + /* Set the dynamic shape of the operator */ + node->op->dynamic_shape = tm_node->dynamic_shape; + } + + if(i < v_nodes->v_num) + return false; + + const TM2_Vector_indices* v_input_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vi_input_indices); + const TM2_Vector_indices* v_output_nodes = GetTmPtr(mmap_buf, tm_graph->offset_vi_output_indices); + + /* Set the input nodes */ + for(unsigned int i = 0; i < v_input_nodes->v_num; i++) + { + StaticNode* node = graph->node_list[v_input_nodes->indices[i]].get(); + if(!node) + { + LOG_ERROR() << "Input node #" << v_input_nodes->indices[i] << " not exist\n"; + return false; + } + AddGraphInputNode(graph, node); + } + + /* Set the output nodes */ + for(unsigned int i = 0; i < v_output_nodes->v_num; i++) + { + StaticNode* node = graph->node_list[v_output_nodes->indices[i]].get(); + if(!node) + { + LOG_ERROR() << "Output node #" << v_output_nodes->indices[i] << " not exist\n"; + return false; + } + AddGraphOutputNode(graph, node); + } + + return true; +} + +bool TmSerializer2::LoadModelFromMem(void* mmap_buf, StaticGraph* graph) +{ + const TM2_Header* tm_header = reinterpret_cast(mmap_buf); + + const TM2_Model* tm_model = GetTmPtr(mmap_buf, 
tm_header->offset_root); + + /* Load dla tengine model */ + //if(tm_model->orig_format == MODEL_FORMAT_DLA) + // return LoadDlaModel(mmap_buf, graph); + + if(tm_model->offset_s_mname == TM2_NOT_SET) + { + SetGraphIdentity(graph, "tengine", "tengine_model", "0"); + } + else + { + std::string tm_model_name; + const TM2_String* tm_str = GetTmPtr(mmap_buf, tm_model->offset_s_mname); + tm_model_name.assign(GetTmPtr(mmap_buf, tm_str->offset_data), tm_str->size-1); + SetGraphIdentity(graph, "tengine", tm_model_name, "0"); + } + + SetModelFormat(graph, tm_model->orig_format); + + if(LoadGraph(graph, tm_model, mmap_buf)) + return true; + else + return false; +} + +bool TmSerializerRegisterOpLoader2(void) +{ + TmSerializerPtr serializer; + + if(!TmSerializerManager::SafeGet("tm_v2", serializer)) + return false; + + TmSerializer2* p_tengine = dynamic_cast(serializer.get()); + + for(int i = 0; i < TM2_OPTYPE_NUM; i++) + { + p_tengine->RegisterOpLoadMethod(GetOpStr(i), op_load_t(LoadTmOpFunc(i))); + p_tengine->RegisterOpSaveMethod(GetOpStr(i), op_save_t(SaveTmOpFunc(i))); + } + + return true; +} + +} // namespace TMSerializer2 + +} // namespace TEngine diff --git a/serializer/tensorflow/tf_serializer.cpp b/serializer/tensorflow/tf_serializer.cpp index 90f05a1ed..e2f363071 100644 --- a/serializer/tensorflow/tf_serializer.cpp +++ b/serializer/tensorflow/tf_serializer.cpp @@ -30,6 +30,8 @@ #include "tf_serializer.hpp" +#include "tengine_c_api.h" +#include "exec_attr.hpp" #include "data_type.hpp" #include "tengine_errno.hpp" @@ -48,6 +50,8 @@ #include "operator/softmax_param.hpp" #include "operator/generic_param.hpp" #include "operator/lstm_param.hpp" +#include "operator/rnn_param.hpp" +#include "operator/gru_param.hpp" #include "operator_manager.hpp" #include "type_name.hpp" @@ -101,6 +105,9 @@ bool TFSerializer::LoadModel(const std::vector& file_list, StaticGr SetGraphSource(graph, file_list[0]); SetGraphSourceFormat(graph, "tensorflow"); SetGraphConstTensorFile(graph, 
file_list[0]); + SetGraphLayout(graph,TENGINE_LAYOUT_NCHW); + SetModelLayout(graph,TENGINE_LAYOUT_NHWC); + SetModelFormat(graph,MODEL_FORMAT_TENSORFLOW); return LoadGraph(tf_net, graph); } @@ -179,7 +186,7 @@ int TFSerializer::FindRNNScope(TFGraph& tf_graph, std::string& rnn_scope) break; } - cell_pos = name.find("gru", while_pos); + cell_pos = name.find("gru_cell", while_pos); if(cell_pos != std::string::npos) { @@ -196,6 +203,15 @@ int TFSerializer::FindRNNScope(TFGraph& tf_graph, std::string& rnn_scope) rnn_type = TF_RNN_BASIC_LSTM; break; } + + cell_pos = name.find("basic_rnn_cell", while_pos); + + if(cell_pos != std::string::npos) + { + rnn_node = node->name; + rnn_type = TF_RNN_BASIC_RNN; + break; + } } if(rnn_node.empty()) @@ -269,136 +285,469 @@ void TFSerializer::ParseLSTMGraph(TFGraph& tf_graph, LSTMNode* lstm_node, std::s rnn_ir++; } } - -void TFSerializer::StripRNNScope(TFGraph& tf_graph, std::string& rnn_scope, int rnn_type) +void TFSerializer::ParseRNNGraph(TFGraph& tf_graph, RNNNode* rnn_node, std::set& rnn_graph) { - LSTMNode* lstm_node = new LSTMNode(); + /* parse input node */ - lstm_node->name = rnn_scope + "lstm"; - lstm_node->op = "LSTM"; + for(unsigned int i = 0; i < rnn_node->inputs.size(); i++) + { + TFNode* node = rnn_node->inputs[i]; - std::set& rnn_graph = lstm_node->rnn_graph; + if(node->op != "Const") + continue; - std::set rnn_inputs; - std::set rnn_outputs; + // node->no_static_node=true; //do not automatically create Static Node - auto ir = tf_graph.seq_nodes.begin(); - std::string::size_type prefix_len = rnn_scope.size(); + if(node->name.find("basic_rnn_cell/kernel") != std::string::npos) + { + rnn_node->kernel = node; + } + else if(node->name.find("basic_rnn_cell/bias") != std::string::npos) + { + rnn_node->bias = node; + } + + } - while(ir != tf_graph.seq_nodes.end()) + auto rnn_ir = rnn_graph.begin(); + auto rnn_ir_end = rnn_graph.end(); + + while(rnn_ir != rnn_ir_end) { - TFNode* node = *ir; + TFNode* node = *rnn_ir; + int 
name_len = node->name.size(); + std::string zero_name = "BasicRNNCellZeroState/zeros"; - if(node->name.find(rnn_scope.c_str(), 0, prefix_len) == std::string::npos) - { - ir++; + if(node->name.find(zero_name, name_len - zero_name.size()) != std::string::npos) + rnn_node->init_h = node; + + rnn_ir++; + } +} +void TFSerializer::ParseGRUGraph(TFGraph& tf_graph, GRUNode* gru_node, std::set& rnn_graph) +{ + /* parse input node */ + + for(unsigned int i = 0; i < gru_node->inputs.size(); i++) + { + TFNode* node = gru_node->inputs[i]; + + if(node->op != "Const") continue; - } - /* this is a node, inside rnn scope, remove it from graph first */ - ir = tf_graph.seq_nodes.erase(ir); + // node->no_static_node=true; //do not automatically create Static Node - rnn_graph.insert(node); + if(node->name.find("gru_cell/gates/kernel") != std::string::npos) + { + gru_node->gate_kernel = node; + } + else if(node->name.find("gru_cell/gates/bias") != std::string::npos) + { + gru_node->gate_bias = node; + } + else if(node->name.find("gru_cell/candidate/kernel") != std::string::npos) + { + gru_node->candidate_kernel = node; + } + else if(node->name.find("gru_cell/candidate/bias") != std::string::npos) + { + gru_node->candidate_bias = node; + } + } auto rnn_ir = rnn_graph.begin(); - auto rnn_end = rnn_graph.end(); + auto rnn_ir_end = rnn_graph.end(); - while(rnn_ir != rnn_end) + while(rnn_ir != rnn_ir_end) { TFNode* node = *rnn_ir; + int name_len = node->name.size(); + std::string zero_name = "GRUCellZeroState/zeros"; - for(unsigned int i = 0; i < node->inputs.size(); i++) + if(node->name.find(zero_name, name_len - zero_name.size()) != std::string::npos) + gru_node->init_h = node; + + rnn_ir++; + } +} + +void TFSerializer::StripRNNScope(TFGraph& tf_graph, std::string& rnn_scope, int rnn_type) +{ + + // collect attributes according to rnn_type + + if(rnn_type == TF_RNN_LSTM) + { + LSTMNode* lstm_node = new LSTMNode(); + + lstm_node->name = rnn_scope + "lstm"; + lstm_node->op = "LSTM"; + + 
std::set& rnn_graph = lstm_node->rnn_graph; + + std::set rnn_inputs; + std::set rnn_outputs; + + auto ir = tf_graph.seq_nodes.begin(); + std::string::size_type prefix_len = rnn_scope.size(); + + while(ir != tf_graph.seq_nodes.end()) { - TFNode* input = node->inputs[i]; + TFNode* node = *ir; - if(!rnn_graph.count(input)) - rnn_inputs.insert(input); + if(node->name.find(rnn_scope.c_str(), 0, prefix_len) == std::string::npos) + { + ir++; + continue; + } + + /* this is a node, inside rnn scope, remove it from graph first */ + ir = tf_graph.seq_nodes.erase(ir); + + rnn_graph.insert(node); } - for(unsigned int i = 0; i < node->outputs.size(); i++) + auto rnn_ir = rnn_graph.begin(); + auto rnn_end = rnn_graph.end(); + + while(rnn_ir != rnn_end) { - TFNode* output = node->outputs[i]; + TFNode* node = *rnn_ir; + + for(unsigned int i = 0; i < node->inputs.size(); i++) + { + TFNode* input = node->inputs[i]; - if(!rnn_graph.count(output)) - rnn_outputs.insert(output); + if(!rnn_graph.count(input)) + rnn_inputs.insert(input); + } + + for(unsigned int i = 0; i < node->outputs.size(); i++) + { + TFNode* output = node->outputs[i]; + + if(!rnn_graph.count(output)) + rnn_outputs.insert(output); + } + + rnn_ir++; } - rnn_ir++; - } + // insert lstm node + auto seq_ir = tf_graph.seq_nodes.begin(); - // insert lstm node - auto seq_ir = tf_graph.seq_nodes.begin(); + while(seq_ir != tf_graph.seq_nodes.end()) + { + TFNode* node = *seq_ir; - while(seq_ir != tf_graph.seq_nodes.end()) - { - TFNode* node = *seq_ir; + if(rnn_inputs.count(node)) + { + tf_graph.seq_nodes.insert(seq_ir, lstm_node); + break; + } + + seq_ir++; + } + + // connect inputs and outputs + auto set_ir = rnn_inputs.begin(); + auto set_ir_end = rnn_inputs.end(); - if(rnn_inputs.count(node)) + while(set_ir != set_ir_end) { - tf_graph.seq_nodes.insert(seq_ir, lstm_node); - break; + TFNode* input_node = *set_ir; + + for(unsigned int j = 0; j < input_node->outputs.size(); j++) + { + TFNode* child_node = input_node->outputs[j]; + 
+ if(rnn_graph.count(child_node)) + input_node->outputs[j] = lstm_node; + } + + lstm_node->inputs.push_back(input_node); + + if(input_node->op == "Identity") + { + TFNode* parent_node = input_node->inputs[0]; + + MergeChildNode(parent_node, input_node); + } + + set_ir++; } - seq_ir++; - } + set_ir = rnn_outputs.begin(); + set_ir_end = rnn_outputs.end(); - // connect inputs and outputs - auto set_ir = rnn_inputs.begin(); - auto set_ir_end = rnn_inputs.end(); + while(set_ir != set_ir_end) + { + TFNode* output_node = *set_ir; + + for(unsigned int j = 0; j < output_node->inputs.size(); j++) + { + TFNode* parent_node = output_node->inputs[j]; - while(set_ir != set_ir_end) + if(rnn_graph.count(parent_node)) + output_node->inputs[j] = lstm_node; + } + + lstm_node->outputs.push_back(output_node); + set_ir++; + } + ParseLSTMGraph(tf_graph, lstm_node, rnn_graph); + } + + if(rnn_type == TF_RNN_BASIC_RNN) { - TFNode* input_node = *set_ir; + RNNNode* rnn_node = new RNNNode(); + + rnn_node->name = rnn_scope + "rnn"; + //std::cout<op = "RNN"; - for(unsigned int j = 0; j < input_node->outputs.size(); j++) + std::set& rnn_graph = rnn_node->rnn_graph; + + std::set rnn_inputs; + std::set rnn_outputs; + + auto ir = tf_graph.seq_nodes.begin(); + std::string::size_type prefix_len = rnn_scope.size(); + + while(ir != tf_graph.seq_nodes.end()) { - TFNode* child_node = input_node->outputs[j]; + TFNode* node = *ir; - if(rnn_graph.count(child_node)) - input_node->outputs[j] = lstm_node; + if(node->name.find(rnn_scope.c_str(), 0, prefix_len) == std::string::npos) + { + ir++; + continue; + } + + /* this is a node, inside rnn scope, remove it from graph first */ + ir = tf_graph.seq_nodes.erase(ir); + + rnn_graph.insert(node); } - lstm_node->inputs.push_back(input_node); + auto rnn_ir = rnn_graph.begin(); + auto rnn_end = rnn_graph.end(); - if(input_node->op == "Identity") + while(rnn_ir != rnn_end) { - TFNode* parent_node = input_node->inputs[0]; + TFNode* node = *rnn_ir; + + for(unsigned int i 
= 0; i < node->inputs.size(); i++) + { + TFNode* input = node->inputs[i]; + + if(!rnn_graph.count(input)) + rnn_inputs.insert(input); + } + + for(unsigned int i = 0; i < node->outputs.size(); i++) + { + TFNode* output = node->outputs[i]; + + if(!rnn_graph.count(output)) + rnn_outputs.insert(output); + } - MergeChildNode(parent_node, input_node); + rnn_ir++; } - set_ir++; - } + // insert rnn node + auto seq_ir = tf_graph.seq_nodes.begin(); - set_ir = rnn_outputs.begin(); - set_ir_end = rnn_outputs.end(); + while(seq_ir != tf_graph.seq_nodes.end()) + { + TFNode* node = *seq_ir; - while(set_ir != set_ir_end) - { - TFNode* output_node = *set_ir; + if(rnn_inputs.count(node)) + { + tf_graph.seq_nodes.insert(seq_ir, rnn_node); + break; + } + + seq_ir++; + } + + // connect inputs and outputs + auto set_ir = rnn_inputs.begin(); + auto set_ir_end = rnn_inputs.end(); + + while(set_ir != set_ir_end) + { + TFNode* input_node = *set_ir; - for(unsigned int j = 0; j < output_node->inputs.size(); j++) + for(unsigned int j = 0; j < input_node->outputs.size(); j++) + { + TFNode* child_node = input_node->outputs[j]; + + if(rnn_graph.count(child_node)) + input_node->outputs[j] = rnn_node; + } + + rnn_node->inputs.push_back(input_node); + + if(input_node->op == "Identity") + { + TFNode* parent_node = input_node->inputs[0]; + + MergeChildNode(parent_node, input_node); + } + + set_ir++; + } + + set_ir = rnn_outputs.begin(); + set_ir_end = rnn_outputs.end(); + + while(set_ir != set_ir_end) { - TFNode* parent_node = output_node->inputs[j]; + TFNode* output_node = *set_ir; + + for(unsigned int j = 0; j < output_node->inputs.size(); j++) + { + TFNode* parent_node = output_node->inputs[j]; - if(rnn_graph.count(parent_node)) - output_node->inputs[j] = lstm_node; + if(rnn_graph.count(parent_node)) + output_node->inputs[j] = rnn_node; + } + + rnn_node->outputs.push_back(output_node); + set_ir++; } - lstm_node->outputs.push_back(output_node); - set_ir++; + ParseRNNGraph(tf_graph, rnn_node, 
rnn_graph); } + if(rnn_type == TF_RNN_GRU) + { + GRUNode* gru_node = new GRUNode(); - // collect attributes according to rnn_type + gru_node->name = rnn_scope + "gru"; + //std::cout<op = "GRU"; - if(rnn_type == TF_RNN_LSTM) - { - ParseLSTMGraph(tf_graph, lstm_node, rnn_graph); + std::set& rnn_graph = gru_node->rnn_graph; + + std::set rnn_inputs; + std::set rnn_outputs; + + auto ir = tf_graph.seq_nodes.begin(); + std::string::size_type prefix_len = rnn_scope.size(); + + while(ir != tf_graph.seq_nodes.end()) + { + TFNode* node = *ir; + + if(node->name.find(rnn_scope.c_str(), 0, prefix_len) == std::string::npos) + { + ir++; + continue; + } + + /* this is a node, inside rnn scope, remove it from graph first */ + ir = tf_graph.seq_nodes.erase(ir); + + rnn_graph.insert(node); + } + + auto rnn_ir = rnn_graph.begin(); + auto rnn_end = rnn_graph.end(); + + while(rnn_ir != rnn_end) + { + TFNode* node = *rnn_ir; + + for(unsigned int i = 0; i < node->inputs.size(); i++) + { + TFNode* input = node->inputs[i]; + + if(!rnn_graph.count(input)) + rnn_inputs.insert(input); + } + + for(unsigned int i = 0; i < node->outputs.size(); i++) + { + TFNode* output = node->outputs[i]; + + if(!rnn_graph.count(output)) + rnn_outputs.insert(output); + } + + rnn_ir++; + } + + // insert rnn node + auto seq_ir = tf_graph.seq_nodes.begin(); + + while(seq_ir != tf_graph.seq_nodes.end()) + { + TFNode* node = *seq_ir; + + if(rnn_inputs.count(node)) + { + tf_graph.seq_nodes.insert(seq_ir, gru_node); + break; + } + + seq_ir++; + } + + // connect inputs and outputs + auto set_ir = rnn_inputs.begin(); + auto set_ir_end = rnn_inputs.end(); + + while(set_ir != set_ir_end) + { + TFNode* input_node = *set_ir; + + for(unsigned int j = 0; j < input_node->outputs.size(); j++) + { + TFNode* child_node = input_node->outputs[j]; + + if(rnn_graph.count(child_node)) + input_node->outputs[j] = gru_node; + } + + gru_node->inputs.push_back(input_node); + + if(input_node->op == "Identity") + { + TFNode* parent_node = 
input_node->inputs[0]; + + MergeChildNode(parent_node, input_node); + } + + set_ir++; + } + + set_ir = rnn_outputs.begin(); + set_ir_end = rnn_outputs.end(); + + while(set_ir != set_ir_end) + { + TFNode* output_node = *set_ir; + + for(unsigned int j = 0; j < output_node->inputs.size(); j++) + { + TFNode* parent_node = output_node->inputs[j]; + + if(rnn_graph.count(parent_node)) + output_node->inputs[j] = gru_node; + } + + gru_node->outputs.push_back(output_node); + set_ir++; + } + + ParseGRUGraph(tf_graph, gru_node, rnn_graph); } // cleanup zero in/zero out node - seq_ir = tf_graph.seq_nodes.begin(); + auto seq_ir = tf_graph.seq_nodes.begin(); while(seq_ir != tf_graph.seq_nodes.end()) { @@ -1352,7 +1701,6 @@ bool TFSerializer::GenerateStaticGraph(TFGraph& tf_graph, StaticGraph* graph) /* create tensor */ StaticTensor* tensor = CreateStaticTensor(graph, tf_node->name); - SetTensorDataLayout(tensor, "NCHW"); SetTensorDataType(tensor, DataType::GetTypeID("float32")); AddNodeOutputTensor(node, tensor); @@ -1422,7 +1770,6 @@ static void CreateInputNode(TFNode* tf_node, StaticGraph* graph) StaticTensor* tensor = CreateStaticTensor(graph, tf_node->name); - SetTensorDataLayout(tensor, "NCHW"); SetTensorDataType(tensor, DataType::GetTypeID("float32")); // if has shape, set it @@ -1678,7 +2025,6 @@ static bool LoadConstTensor(TFNode* tf_node, StaticGraph* graph) SetTensorDim(tensor, dims); SetTensorSize(tensor, mem_size); - SetTensorDataLayout(tensor, layout); SetConstTensorBuffer(tensor, mem_ptr); } @@ -1730,13 +2076,17 @@ static bool LoadConv2D(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) { if(value.s() == "VALID") { - param.pad_h = 0; - param.pad_w = 0; + param.pad_h0 = 0; + param.pad_h1 = 0; + param.pad_w0 = 0; + param.pad_w1 = 0; } else if(value.s() == "SAME") { - param.pad_h = -1; - param.pad_w = -1; + param.pad_h0 = -1; + param.pad_h1 = -1; + param.pad_w0 = -1; + param.pad_w1 = -1; } } @@ -1790,7 +2140,6 @@ static bool LoadConv2D(TFNode* tf_node, TFGraph& 
tf_graph, StaticGraph* graph) dims.push_back(kernel_w); SetTensorDim(weight_tensor, dims); - SetTensorDataLayout(weight_tensor, "NCHW"); param.kernel_h = kernel_h; param.kernel_w = kernel_w; @@ -1872,15 +2221,12 @@ static bool LoadConv2D(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) } } - /* update the padding arguments */ - saved_param.pads.resize(4); - /* h pad */ - saved_param.pads[0] = shape_data[2]; - saved_param.pads[2] = shape_data[3]; + saved_param.pad_h0 = shape_data[2]; + saved_param.pad_h1 = shape_data[3]; /* w pad */ - saved_param.pads[1] = shape_data[4]; - saved_param.pads[3] = shape_data[5]; + saved_param.pad_w0 = shape_data[4]; + saved_param.pad_w1 = shape_data[5]; SetOperatorParam(op, saved_param); } @@ -1919,13 +2265,17 @@ static bool LoadPool(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) { if(value.s() == "VALID") { - param.pad_h = 0; - param.pad_w = 0; + param.pad_h0 = 0; + param.pad_h1 = 0; + param.pad_w0 = 0; + param.pad_w1 = 0; } else if(value.s() == "SAME") { - param.pad_h = -1; - param.pad_w = -1; + param.pad_h0 = -1; + param.pad_h1 = -1; + param.pad_w0 = -1; + param.pad_w1 = -1; } } @@ -1938,21 +2288,6 @@ static bool LoadPool(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) param.alg = kPoolMax; } - // convert to onnx format - param.kernel_shape.resize(2); - param.kernel_shape[0] = param.kernel_h; - param.kernel_shape[1] = param.kernel_w; - - param.pads.resize(4); - param.pads[0] = param.pad_h; - param.pads[1] = param.pad_w; - param.pads[2] = param.pad_h; - param.pads[3] = param.pad_w; - - param.strides.resize(2); - param.strides[0] = param.stride_h; - param.strides[1] = param.stride_w; - StaticOp* op = CreateStaticOp(graph, "Pooling"); SetOperatorParam(op, param); SetNodeOp(node, op); @@ -2160,7 +2495,6 @@ static void CreatePresetNode(StaticGraph* graph, StaticNode* node, const char* n StaticTensor* tensor = CreateStaticConstTensor(graph, new_tensor_name); SetTensorDim(tensor, dims); SetTensorDataType(tensor, 
DataType::GetTypeID("float32")); - SetTensorDataLayout(tensor, layout); int elem_size = 1; @@ -2409,14 +2743,11 @@ static bool LoadGemm(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) param.beta = 1; StaticTensor* weight_tensor = FindTensor(graph, input1->name); - SetTensorDataLayout(weight_tensor, "HW"); if(tf_node->inputs.size() > 2) { TFNode* bias = tf_node->inputs[2]; AddNodeInputTensor(node, bias->static_tensor); - StaticTensor* bias_tensor = FindTensor(graph, bias->name); - SetTensorDataLayout(bias_tensor, "W"); } if(param.transA) @@ -2552,7 +2883,6 @@ static bool LoadLSTMInitState(LSTMNode* lstm_node, TFNode* init_node, StaticGrap SetTensorDataType(const_tensor, DataType::GetTypeID("float32")); SetTensorDim(const_tensor, dims); SetTensorSize(const_tensor, dims[0] * dims[1] * sizeof(float)); - SetTensorDataLayout(const_tensor, "W"); SetConstTensorBuffer(const_tensor, mem_ptr); SetConstTensorFileLocation(const_tensor, -1, 0); @@ -2565,6 +2895,136 @@ static bool LoadLSTMInitState(LSTMNode* lstm_node, TFNode* init_node, StaticGrap return true; } +static bool LoadGRUInitState(GRUNode* gru_node, TFNode* init_node, StaticGraph* graph) +{ + /* load const value */ + TFNode* const_val_node; + TFNode* concat_node; + + if(init_node->inputs[0]->op == "Const") + { + const_val_node = init_node->inputs[0]; + concat_node = init_node->inputs[1]; + } + else + { + const_val_node = init_node->inputs[1]; + concat_node = init_node->inputs[0]; + } + + int* const_ptr = ( int* )LoadConstParam(const_val_node); + float const_val = const_ptr[0]; + + free(const_ptr); + + // int* dim0_ptr = ( int* )LoadConstParam(concat_node->inputs[0]); + int* dim0_ptr = ( int* )LoadConstParam(concat_node->inputs[1]); + + std::vector dims(1); + + dims[0] = dim0_ptr[0]; + // dims[1] = dim1_ptr[0]; + + free(dim0_ptr); + // free(dim1_ptr); + + float* mem_ptr = ( float* )malloc(dims[0] * sizeof(float)); + + for(int i = 0; i < dims[0]; i++) + { + mem_ptr[i] = const_val; + } + + /* create node and 
tensor */ + + std::string const_node_name; + + if(init_node == gru_node->init_h) + const_node_name = gru_node->name + "/init_h"; + + StaticNode* const_node = CreateStaticNode(graph, const_node_name); + StaticTensor* const_tensor = CreateStaticConstTensor(graph, const_node_name); + + SetTensorDataType(const_tensor, DataType::GetTypeID("float32")); + SetTensorDim(const_tensor, dims); + SetTensorSize(const_tensor, dims[0]* sizeof(float)); + SetConstTensorBuffer(const_tensor, mem_ptr); + SetConstTensorFileLocation(const_tensor, -1, 0); + + AddNodeOutputTensor(const_node, const_tensor); + + StaticOp* const_op = CreateStaticOp(graph, "Const"); + SetNodeOp(const_node, const_op); + + AddNodeInputTensor(gru_node->static_node, const_tensor); + + return true; +} +static bool LoadRNNInitState(RNNNode* rnn_node, TFNode* init_node, StaticGraph* graph) +{ + /* load const value */ + TFNode* const_val_node; + TFNode* concat_node; + + if(init_node->inputs[0]->op == "Const") + { + const_val_node = init_node->inputs[0]; + concat_node = init_node->inputs[1]; + } + else + { + const_val_node = init_node->inputs[1]; + concat_node = init_node->inputs[0]; + } + + int* const_ptr = ( int* )LoadConstParam(const_val_node); + float const_val = const_ptr[0]; + + free(const_ptr); + + int* dim0_ptr = ( int* )LoadConstParam(concat_node->inputs[0]); + int* dim1_ptr = ( int* )LoadConstParam(concat_node->inputs[1]); + + std::vector dims(2); + + dims[0] = dim0_ptr[0]; + dims[1] = dim1_ptr[0]; + + free(dim0_ptr); + free(dim1_ptr); + + float* mem_ptr = ( float* )malloc(dims[0] * dims[1] * sizeof(float)); + + for(int i = 0; i < dims[0] * dims[1]; i++) + { + mem_ptr[i] = const_val; + } + + /* create node and tensor */ + + std::string const_node_name; + + if(init_node == rnn_node->init_h) + const_node_name = rnn_node->name + "/init_h"; + + StaticNode* const_node = CreateStaticNode(graph, const_node_name); + StaticTensor* const_tensor = CreateStaticConstTensor(graph, const_node_name); + + 
SetTensorDataType(const_tensor, DataType::GetTypeID("float32")); + SetTensorDim(const_tensor, dims); + SetTensorSize(const_tensor, dims[0] * dims[1] * sizeof(float)); + SetConstTensorBuffer(const_tensor, mem_ptr); + SetConstTensorFileLocation(const_tensor, -1, 0); + + AddNodeOutputTensor(const_node, const_tensor); + + StaticOp* const_op = CreateStaticOp(graph, "Const"); + SetNodeOp(const_node, const_op); + + AddNodeInputTensor(rnn_node->static_node, const_tensor); + + return true; +} static bool LoadLSTM(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) { @@ -2629,6 +3089,8 @@ static bool LoadLSTM(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) int cell_size = kernel_dims[1] / 4; param.cell_size = cell_size; + //mxnet false + param.mxnet_flag =0; if(lstm_node->projection) { @@ -2650,6 +3112,99 @@ static bool LoadLSTM(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) return true; } + +static bool LoadRNN(TFNode* tf_node, TFGraph& tf_graph, StaticGraph* graph) +{ + StaticNode* node = tf_node->static_node; + + RNNNode* rnn_node = dynamic_cast(tf_node); + RNNParam param = any_cast(OpManager::GetOpDefParam("RNN")); + + // those two are mandatory + AddNodeInputTensor(node, tf_node->inputs[0]->static_tensor); + AddNodeInputTensor(node, rnn_node->kernel->static_tensor); + + // optional tensors + if(rnn_node->bias) + { + param.has_bias = 1; + AddNodeInputTensor(node, rnn_node->bias->static_tensor); + } + + if(rnn_node->init_h) + { + param.has_init_state = 1; + LoadRNNInitState(rnn_node, rnn_node->init_h, graph); + } + + /* calculate and set other paremeters*/ + const std::vector& kernel_dims = GetTensorDim(rnn_node->kernel->static_tensor); + + int data_size = kernel_dims[0]; + + int hidden_size = kernel_dims[1]; + + param.hidden_size = hidden_size; + + param.input_size = data_size - param.hidden_size; + + StaticOp* op = CreateStaticOp(graph, "RNN"); + SetOperatorParam(op, param); + + SetNodeOp(node, op); + + return true; +} +static bool LoadGRU(TFNode* 
tf_node, TFGraph& tf_graph, StaticGraph* graph) +{ + StaticNode* node = tf_node->static_node; + + GRUNode* gru_node = dynamic_cast(tf_node); + GRUParam param = any_cast(OpManager::GetOpDefParam("GRU")); + + // those 3 are mandatory + AddNodeInputTensor(node, tf_node->inputs[0]->static_tensor); + AddNodeInputTensor(node, gru_node->gate_kernel->static_tensor); + AddNodeInputTensor(node, gru_node->candidate_kernel->static_tensor); + + // optional tensors + if(gru_node->gate_bias) + { + param.has_gate_bias = 1; + AddNodeInputTensor(node, gru_node->gate_bias->static_tensor); + } + if(gru_node->candidate_bias) + { + param.has_candidate_bias = 1; + AddNodeInputTensor(node, gru_node->candidate_bias->static_tensor); + } + + if(gru_node->init_h) + { + param.has_init_state = 1; + LoadGRUInitState(gru_node, gru_node->init_h, graph); + } + + /* calculate and set other paremeters*/ + const std::vector& kernel_dims = GetTensorDim(gru_node->gate_kernel->static_tensor); + + int data_size = kernel_dims[0]; + + int hidden_size = kernel_dims[1]; + + param.hidden_size = hidden_size/2; + + param.input_size = data_size - param.hidden_size; + + param.mxnet_flag=0; + StaticOp* op = CreateStaticOp(graph, "GRU"); + SetOperatorParam(op, param); + + SetNodeOp(node, op); + + return true; +} + } // namespace tf_serializer using namespace tf_serializer; @@ -2688,7 +3243,8 @@ bool TFSerializerRegisterOpLoader(void) p_tf->RegisterOpLoadMethod("AudioSpectrogram", op_load_t(LoadGeneric)); p_tf->RegisterOpLoadMethod("Mfcc", op_load_t(LoadGeneric)); p_tf->RegisterOpLoadMethod("LSTM", op_load_t(LoadLSTM)); - + p_tf->RegisterOpLoadMethod("RNN", op_load_t(LoadRNN)); + p_tf->RegisterOpLoadMethod("GRU", op_load_t(LoadGRU)); return true; } diff --git a/serializer/tf_lite/Makefile b/serializer/tf_lite/Makefile new file mode 100644 index 000000000..8a973b585 --- /dev/null +++ b/serializer/tf_lite/Makefile @@ -0,0 +1,3 @@ +obj-y+=tf_lite_serializer.o + +COMMON_CFLAGS+=-I../include/tf_lite diff --git 
a/serializer/tf_lite/tf_lite_serializer.cpp b/serializer/tf_lite/tf_lite_serializer.cpp new file mode 100644 index 000000000..88dd6be58 --- /dev/null +++ b/serializer/tf_lite/tf_lite_serializer.cpp @@ -0,0 +1,846 @@ +#include +#include + +#include "tengine_c_api.h" +#include "exec_attr.hpp" +#include "tf_lite_serializer.hpp" +#include "logger.hpp" +#include "data_type.hpp" + +#include "operator/conv_param.hpp" +#include "operator/pool_param.hpp" +#include "operator/concat_param.hpp" +#include "operator/reshape_param.hpp" +#include "operator/softmax_param.hpp" +#include "operator/detection_postprocess_param.hpp" +#include "operator/eltwise_param.hpp" +#include "flatbuffers/flexbuffers.h" + +namespace TEngine { + +using LiteNode = TFLiteSerializer::LiteNode; +using LiteTensor = TFLiteSerializer::LiteTensor; +using LiteGraph = TFLiteSerializer::LiteGraph; + +using op_load_t = std::function; + +bool TFLiteSerializer::LoadModel(const std::vector& file_list, StaticGraph* graph) +{ + if(file_list.size() != GetFileNum()) + return false; + + std::ifstream input_file; + + input_file.open(file_list[0], std::ios::binary | std::ios::in); + input_file.seekg(0, std::ios::end); + + int model_len = input_file.tellg(); + char* model_data = new char[model_len]; + + input_file.seekg(0, std::ios::beg); + input_file.read(model_data, model_len); + input_file.close(); + + SetGraphSource(graph, file_list[0]); + SetGraphSourceFormat(graph, "tflite"); + SetGraphLayout(graph,TENGINE_LAYOUT_NHWC); + SetModelLayout(graph,TENGINE_LAYOUT_NHWC); + SetModelFormat(graph,MODEL_FORMAT_TFLITE); + + bool ret = LoadModelFromMem(model_data, model_len, graph); + + if(!ret) + delete[] model_data; + + return ret; +} + +bool TFLiteSerializer::LoadModelFromMem(char* mem_addr, int mem_size, StaticGraph* graph) +{ + ::flatbuffers::Verifier verifier(( const unsigned char* )mem_addr, mem_size); + + if(!::tflite::VerifyModelBuffer(verifier)) + { + LOG_ERROR() << "bad tf lite model file\n"; + return false; + } + + 
const LiteModel* lite_model = ::tflite::GetModel(mem_addr); + + if(!lite_model->subgraphs() || lite_model->subgraphs()->size() != 1) + { + LOG_ERROR() << "bad graph format\n"; + return false; + } + + LiteGraph lite_graph; + + lite_graph.lite_model = lite_model; + + if(!ConstructGraph(lite_model, &lite_graph)) + return false; + + // DumpLiteGraph(&lite_graph); + + if(!OptimizeGraph(&lite_graph)) + return false; + + if(!GenerateStaticGraph(&lite_graph, graph)) + return false; + + return true; +} + +bool TFLiteSerializer::ConstructGraph(const LiteModel* lite_model, LiteGraph* lite_graph) +{ + // load all tensors first + + auto tensors = (*lite_model->subgraphs())[0]->tensors(); + + int i = 0; + + for(auto* tensor : *tensors) + { + LiteTensor* lite_tensor = new LiteTensor(); + + lite_tensor->tf_tensor = tensor; + lite_tensor->idx = i++; + lite_tensor->name = tensor->name()->c_str(); + + auto shape = tensor->shape(); + + for(unsigned int i = 0; i < shape->Length(); ++i) + lite_tensor->shape.push_back(shape->Get(i)); + + int type = tensor->type(); + + switch(type) + { + case ::tflite::TensorType_FLOAT32: + lite_tensor->type = "FP32"; + break; + case ::tflite::TensorType_UINT8: + lite_tensor->type = "UINT8"; + break; + case ::tflite::TensorType_INT32: + lite_tensor->type = "INT32"; + break; + default: + lite_tensor->type = "unknown"; + } + + lite_graph->tensor_list.push_back(lite_tensor); + } + + // load ops + + const auto ops = (*lite_model->subgraphs())[0]->operators(); + const auto opcodes = lite_model->operator_codes(); + + i = 0; + + for(auto* op : *ops) + { + LiteNode* lite_node = new LiteNode(); + + lite_node->lite_op = op; + + /* get op name */ + + int op_code_idx = op->opcode_index(); + + const auto* op_code = opcodes->Get(op_code_idx); + + if(op_code->builtin_code() == ::tflite::BuiltinOperator_CUSTOM) + lite_node->op = op_code->custom_code()->c_str(); + else + lite_node->op = EnumNameBuiltinOperator(op_code->builtin_code()); + + /*inputs and outputs */ + auto 
inputs = op->inputs(); + + for(unsigned int i = 0; i < inputs->Length(); i++) + { + auto input_idx = inputs->Get(i); + + if(input_idx != -1) + { + LiteTensor* lite_tensor = lite_graph->tensor_list.at(input_idx); + lite_node->inputs.push_back(lite_tensor); + } + else + { + LiteTensor* lite_tensor = new LiteTensor(); + + lite_tensor->name = "NoData"; + lite_tensor->idx = lite_graph->tensor_list.size(); + + lite_graph->tensor_list.push_back(lite_tensor); + + lite_node->inputs.push_back(lite_tensor); + } + } + + auto outputs = op->outputs(); + + for(unsigned int i = 0; i < outputs->Length(); i++) + { + auto output_idx = outputs->Get(i); + LiteTensor* lite_tensor; + + if(output_idx != -1) + { + lite_tensor = lite_graph->tensor_list.at(output_idx); + lite_node->outputs.push_back(lite_tensor); + } + else + { + lite_tensor = new LiteTensor(); + lite_node->outputs.push_back(lite_tensor); + } + + lite_tensor->producer = lite_node; + } + + lite_node->name = lite_node->outputs[0]->name; + + lite_graph->seq_nodes.push_back(lite_node); + } + + // setup graph inputs/outputs + auto inputs = (*lite_model->subgraphs())[0]->inputs(); + + if(inputs) + { + for(int input : *inputs) + { + LiteTensor* tensor = lite_graph->tensor_list.at(input); + lite_graph->input_tensors.push_back(tensor); + tensor->graph_input = true; + } + } + + auto outputs = (*lite_model->subgraphs())[0]->outputs(); + + if(outputs) + { + for(int output : *outputs) + { + LiteTensor* tensor = lite_graph->tensor_list.at(output); + tensor->graph_output = true; + lite_graph->output_tensors.push_back(tensor); + } + } + + return true; +} + +bool TFLiteSerializer::OptimizeGraph(LiteGraph* lite_graph) +{ + return true; +} + +bool TFLiteSerializer::LoadTensorScaleAndZero(StaticTensor* static_tensor, LiteTensor* lite_tensor) +{ + auto quantization = lite_tensor->tf_tensor->quantization(); + float scale = 1.f; + int zero_point = 0; + + if(quantization->scale() && quantization->zero_point()) + { + scale = 
quantization->scale()->Get(0); + zero_point = quantization->zero_point()->Get(0); + } + static_tensor->scale = scale; + static_tensor->zero_point = zero_point; + + return true; +} + +bool TFLiteSerializer::LoadConstLiteTensor(StaticTensor* static_tensor, LiteTensor* tensor, LiteGraph* lite_graph, + StaticGraph* graph) +{ + void* mem_buf; + int shape_size = 1; + int mem_size; + const TFLiteTensor* tf_tensor = tensor->tf_tensor; + + auto* buffers = lite_graph->lite_model->buffers(); + int buf_idx = tf_tensor->buffer(); + + auto* buffer = buffers->Get(buf_idx); + auto* src_buf = buffer->data(); + + for(unsigned int i = 0; i < tensor->shape.size(); i++) + shape_size *= tensor->shape[i]; + + int element_size = DataType::GetTypeSize(static_tensor->data_type); + mem_size = shape_size * element_size; + + mem_buf = malloc(mem_size); + + if(tensor->type == "UINT8") + { + const uint8_t* src_ptr = ( const uint8_t* )(src_buf->data()); + memcpy(mem_buf, src_ptr, mem_size); + } + else if(tensor->type == "INT32") + { + const int* src_ptr = ( const int* )src_buf->data(); + memcpy(mem_buf, src_ptr, mem_size); + } + else + { + const void* src_ptr = src_buf->data(); + memcpy(mem_buf, src_ptr, mem_size); + } + + // DIM SWITCH WILL BE DELAYED to OP LOAD + SetConstTensorBuffer(static_tensor, mem_buf); + SetConstTensorFileLocation(static_tensor, -1, 0); + + StaticOp* op = CreateStaticOp(graph, "Const"); + StaticNode* node = CreateStaticNode(graph, tensor->name); + + SetNodeOp(node, op); + + AddNodeOutputTensor(node, static_tensor); + + return true; +} + +bool TFLiteSerializer::LoadLiteTensor(LiteTensor* tensor, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticTensor* static_tensor; + bool const_tensor = false; + + if(tensor->producer || tensor->graph_input) + { + static_tensor = CreateStaticTensor(graph, tensor->name); + } + else + { + const_tensor = true; + static_tensor = CreateStaticConstTensor(graph, tensor->name); + } + int data_type; + if(tensor->type == "UINT8") + data_type = 
TENGINE_DT_UINT8; + else if(tensor->type == "INT32") + data_type = TENGINE_DT_INT32; + else + { + data_type = TENGINE_DT_FP32; + } + SetTensorDataType(static_tensor, data_type); + SetTensorDim(static_tensor, tensor->shape); + + LoadTensorScaleAndZero(static_tensor, tensor); + + tensor->static_tensor = static_tensor; + + // layout will be set during the op load + + // Load Const Tensor + if(const_tensor) + return LoadConstLiteTensor(static_tensor, tensor, lite_graph, graph); + + return true; +} + +bool TFLiteSerializer::LoadLiteNode(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + if(!FindOpLoadMethod(node->op)) + { + LOG_ERROR() << "cannot find load method for op: " << node->op << "\n"; + return false; + } + + StaticNode* static_node = CreateStaticNode(graph, node->name); + + // handle input + for(unsigned int i = 0; i < node->inputs.size(); i++) + { + LiteTensor* input = node->inputs.at(i); + AddNodeInputTensor(static_node, input->static_tensor); + } + + // handle output + + for(unsigned int i = 0; i < node->outputs.size(); i++) + { + LiteTensor* output = node->outputs.at(i); + AddNodeOutputTensor(static_node, output->static_tensor); + } + + // for each op, load the op + op_load_t op_func = any_cast(GetOpLoadMethod(node->op)); + + node->static_node = static_node; + + if(!op_func(node, lite_graph, graph)) + { + LOG_ERROR() << "failed to load node: " << node->name << " op: " << node->op << "\n"; + return false; + } + + return true; +} + +void TFLiteSerializer::CreateGraphInputNode(LiteTensor* tensor, StaticGraph* graph) +{ + StaticOp* op = CreateStaticOp(graph, "InputOp"); + StaticNode* node = CreateStaticNode(graph, tensor->name); + + SetNodeOp(node, op); + + AddNodeOutputTensor(node, tensor->static_tensor); + + AddGraphInputNode(graph, node); +} + +bool TFLiteSerializer::GenerateStaticGraph(LiteGraph* lite_graph, StaticGraph* graph) +{ + // first load all tensor + int tensor_number = lite_graph->tensor_list.size(); + + for(int i = 0; i < 
tensor_number; i++) + { + LiteTensor* tensor = lite_graph->tensor_list.at(i); + + LoadLiteTensor(tensor, lite_graph, graph); + } + + // create input node for graph_input tensor + for(unsigned int i = 0; i < lite_graph->input_tensors.size(); i++) + { + LiteTensor* tensor = lite_graph->input_tensors.at(i); + + CreateGraphInputNode(tensor, graph); + } + + // second load all nodes + int node_number = lite_graph->seq_nodes.size(); + + for(int i = 0; i < node_number; i++) + { + LiteNode* node = lite_graph->seq_nodes.at(i); + + if(!LoadLiteNode(node, lite_graph, graph)) + return false; + } + + return true; +} + +void TFLiteSerializer::DumpLiteTensor(LiteTensor* tensor) +{ + std::cout << tensor->name << " " << tensor->type << " ["; + for(unsigned int i = 0; i < tensor->shape.size(); i++) + std::cout << " " << tensor->shape[i]; + + std::cout << "] "; + + if(!tensor->producer && !tensor->graph_input) + std::cout << " Const "; +} + +void TFLiteSerializer::DumpLiteGraph(LiteGraph* lite_graph) +{ + for(unsigned int i = 0; i < lite_graph->seq_nodes.size(); i++) + { + LiteNode* node = lite_graph->seq_nodes.at(i); + + std::cout << i << ":\t" << node->op << " \t" << node->name << "\n"; + std::cout << "\tInput: " << node->inputs.size() << " Output: " << node->outputs.size() << "\n"; + + for(unsigned int j = 0; j < node->inputs.size(); j++) + { + LiteTensor* tensor = node->inputs[j]; + std::cout << "\t I" << j << ": "; + DumpLiteTensor(tensor); + std::cout << "\n"; + } + + for(unsigned int j = 0; j < node->outputs.size(); j++) + { + LiteTensor* tensor = node->outputs[j]; + std::cout << "\t O" << j << ": "; + DumpLiteTensor(tensor); + std::cout << "\n"; + } + } + std::cout << "\nGraph Inputs:\n"; + + for(unsigned int i = 0; i < lite_graph->input_tensors.size(); i++) + { + LiteTensor* tensor = lite_graph->input_tensors.at(i); + std::cout << "\t" << i << "\t" << tensor->name << "\n"; + } + + std::cout << "\nGraph Outputs:\n"; + + for(unsigned int i = 0; i < 
lite_graph->output_tensors.size(); i++) + { + LiteTensor* tensor = lite_graph->output_tensors.at(i); + std::cout << "\t" << i << "\t" << tensor->name << "\n"; + } +} + +namespace tf_lite_serializer { + +static void ExchangeNC(const std::vector& shape, std::vector& new_shape) +{ + new_shape.resize(4); + + new_shape[0] = shape[3]; + new_shape[1] = shape[1]; + new_shape[2] = shape[2]; + new_shape[3] = shape[0]; +} + +static bool LoadConv2D(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + int kernel_h = 1, kernel_w = 1, output_channel = 1; + LiteTensor* lite_tensor = node->inputs[1]; + + output_channel = lite_tensor->shape[0]; + kernel_h = lite_tensor->shape[1]; + kernel_w = lite_tensor->shape[2]; + + ConvParam param = any_cast(OpManager::GetOpDefParam("Convolution")); + const tflite::Conv2DOptions* lite_param = node->lite_op->builtin_options_as(); + + int lite_activation = lite_param->fused_activation_function(); + switch(lite_activation) + { + case 0: + param.activation = -1; + break; + case 1: + param.activation = 0; + break; + case 2: + param.activation = 1; + break; + case 3: + param.activation = 6; + break; + default: + param.activation = -4; + break; + } + param.stride_h = lite_param->stride_h(); + param.stride_w = lite_param->stride_w(); + int padding = lite_param->padding(); + if(padding == 0) + { + param.pad_h0 = -1; + param.pad_h1 = -1; + param.pad_w0 = -1; + param.pad_w1 = -1; + } + else + { + param.pad_h0 = 0; + param.pad_h1 = 0; + param.pad_w0 = 0; + param.pad_w1 = 0; + } + param.dilation_h = 1; + param.dilation_w = 1; + param.group = 1; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.output_channel = output_channel; + + StaticOp* op = CreateStaticOp(graph, "Convolution"); + + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + // bias + + return true; +} + +static bool LoadConv2DDepthwise(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* 
static_node = node->static_node; + int kernel_h = 1, kernel_w = 1, output_channel = 1; + LiteTensor* lite_tensor = node->inputs[1]; + { + output_channel = lite_tensor->static_tensor->dims[3]; + kernel_h = lite_tensor->static_tensor->dims[1]; + kernel_w = lite_tensor->static_tensor->dims[2]; + } + ConvParam param = any_cast(OpManager::GetOpDefParam("Convolution")); + const tflite::DepthwiseConv2DOptions* lite_param = + node->lite_op->builtin_options_as(); + + int lite_activation = lite_param->fused_activation_function(); + switch(lite_activation) + { + case 0: + param.activation = -1; + break; + case 1: + param.activation = 0; + break; + case 2: + param.activation = 1; + break; + case 3: + param.activation = 6; + break; + default: + param.activation = -4; + break; + } + + param.stride_h = lite_param->stride_h(); + param.stride_w = lite_param->stride_w(); + param.group = output_channel / lite_param->depth_multiplier(); + int padding = lite_param->padding(); + if(padding == 0) + { + param.pad_h0 = -1; + param.pad_h1 = -1; + param.pad_w0 = -1; + param.pad_w1 = -1; + } + else + { + param.pad_h0 = 0; + param.pad_h1 = 0; + param.pad_w0 = 0; + param.pad_w1 = 0; + } + + param.dilation_h = 1; + param.dilation_w = 1; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.output_channel = output_channel; + + StaticOp* op = CreateStaticOp(graph, "Convolution"); + + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + std::vector new_shape; + ExchangeNC(node->inputs[1]->shape, new_shape); + SetTensorDim(node->inputs[1]->static_tensor, new_shape); + + + return true; +} + +static bool LoadPooling(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + PoolParam param = any_cast(OpManager::GetOpDefParam("Pooling")); + const tflite::Pool2DOptions* lite_param = node->lite_op->builtin_options_as(); + + param.kernel_h = lite_param->filter_height(); + param.kernel_w = lite_param->filter_width(); + + 
param.stride_h = lite_param->stride_h(); + param.stride_w = lite_param->stride_w(); + + if(lite_param->padding() == 0) + { + param.pad_h0 = -1; + param.pad_h1 = -1; + param.pad_w0 = -1; + param.pad_w1 = -1; + } + else + { + param.pad_h0 = 0; + param.pad_h1 = 0; + param.pad_w0 = 0; + param.pad_w1 = 0; + } + + if(node->op == "AVERAGE_POOL_2D") + param.alg = kPoolAvg; + else if(node->op == "MAX_POOL_2D") + param.alg = kPoolMax; + + StaticOp* op = CreateStaticOp(graph, "Pooling"); + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + return true; +} + +static bool LoadConcat(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + ConcatParam param = any_cast(OpManager::GetOpDefParam("Concat")); + const tflite::ConcatenationOptions* lite_param = node->lite_op->builtin_options_as(); + int activation = lite_param->fused_activation_function(); + + param.axis = lite_param->axis(); + + StaticOp* op = CreateStaticOp(graph, "Concat"); + if(activation) + AddOperatorAttr(op, "Activation", activation); + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + + return true; +} + +static bool LoadReshape(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + + ReshapeParam param = any_cast(OpManager::GetOpDefParam("Reshape")); + StaticTensor* output_tensor = node->outputs[0]->static_tensor; + // const tflite::ReshapeOptions * lite_param = + // node->lite_op->builtin_options_as(); + // set dims + auto new_shape = output_tensor->dims; + if(new_shape.size() == 4) + { + param.dim_0 = new_shape[0]; + param.dim_1 = new_shape[1]; + param.dim_2 = new_shape[2]; + param.dim_3 = new_shape[3]; + } + else if(new_shape.size() == 3) + { + param.dim_0 = new_shape[0]; + param.dim_1 = new_shape[1]; + param.dim_2 = new_shape[2]; + } + else if(new_shape.size() == 2) + { + param.dim_0 = new_shape[0]; + param.dim_1 = new_shape[1]; + } + else + return false; + + StaticOp* op = 
CreateStaticOp(graph, "Reshape"); + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + return true; +} + +static bool LoadLogistic(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + + StaticOp* op = CreateStaticOp(graph, "Logistic"); + SetNodeOp(static_node, op); + + return true; +} + +static bool LoadSoftmax(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + SoftmaxParam param = any_cast(OpManager::GetOpDefParam("Softmax")); + + param.axis = 1; + StaticOp* op = CreateStaticOp(graph, "Softmax"); + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + return true; +} + +static bool LoadEltwise(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + EltwiseParam param = any_cast(OpManager::GetOpDefParam("Eltwise")); + + if(node->op == "ADD") + param.type = ELT_SUM; + else if(node->op == "SUB") + param.type = ELT_SUB; + else if(node->op == "PROD") + param.type = ELT_PROD; + else if(node->op == "RSQRT") + param.type = ELT_RSQRT; + else if(node->op == "DIV") + param.type = ELT_DIV; + else if(node->op == "LOG") + param.type = ELT_LOG; + else if(node->op == "EXP") + param.type = ELT_EXP; + else if(node->op == "POW") + param.type = ELT_POW; + else if(node->op == "SQRT") + param.type = ELT_SQRT; + else if(node->op == "FLOOR") + param.type = ELT_FLOOR; + StaticOp* op = CreateStaticOp(graph, "Eltwise"); + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + + return true; +} + +static bool LoadDetectionPostProcess(LiteNode* node, LiteGraph* lite_graph, StaticGraph* graph) +{ + StaticNode* static_node = node->static_node; + + DetectionPostProcessParam param = + any_cast(OpManager::GetOpDefParam("DetectionPostProcess")); + const uint8_t* lite_buffer = node->lite_op->custom_options()->data(); + size_t lite_buffer_len = node->lite_op->custom_options()->size(); + + const 
flexbuffers::Map& m = flexbuffers::GetRoot(lite_buffer, lite_buffer_len).AsMap(); + param.max_detections = m["max_detections"].AsInt32(); + param.max_classes_per_detection = m["max_classes_per_detection"].AsInt32(); + param.nms_score_threshold = m["nms_score_threshold"].AsFloat(); + param.nms_iou_threshold = m["nms_iou_threshold"].AsFloat(); + param.num_classes = m["num_classes"].AsInt32(); + param.scales.resize(4); + param.scales[0] = m["y_scale"].AsFloat(); + param.scales[1] = m["x_scale"].AsFloat(); + param.scales[2] = m["h_scale"].AsFloat(); + param.scales[3] = m["w_scale"].AsFloat(); + + StaticOp* op = CreateStaticOp(graph, "DetectionPostProcess"); + SetOperatorParam(op, param); + SetNodeOp(static_node, op); + return true; +} + +} // namespace tf_lite_serializer + +using namespace tf_lite_serializer; + +bool TFLiteSerializerRegisterOpLoader(void) +{ + SerializerPtr serializer; + + if(!SerializerManager::SafeGet("tflite", serializer)) + return false; + + TFLiteSerializer* tf_lite = dynamic_cast(serializer.get()); + + tf_lite->RegisterOpLoadMethod("CONV_2D", op_load_t(LoadConv2D)); + tf_lite->RegisterOpLoadMethod("AVERAGE_POOL_2D", op_load_t(LoadPooling)); + tf_lite->RegisterOpLoadMethod("MAX_POOL_2D", op_load_t(LoadPooling)); + tf_lite->RegisterOpLoadMethod("DEPTHWISE_CONV_2D", op_load_t(LoadConv2DDepthwise)); + tf_lite->RegisterOpLoadMethod("RESHAPE", op_load_t(LoadReshape)); + tf_lite->RegisterOpLoadMethod("SQUEEZE", op_load_t(LoadReshape)); + tf_lite->RegisterOpLoadMethod("CONCATENATION", op_load_t(LoadConcat)); + tf_lite->RegisterOpLoadMethod("LOGISTIC", op_load_t(LoadLogistic)); + tf_lite->RegisterOpLoadMethod("SOFTMAX", op_load_t(LoadSoftmax)); + tf_lite->RegisterOpLoadMethod("ADD", op_load_t(LoadEltwise)); + tf_lite->RegisterOpLoadMethod("TFLite_Detection_PostProcess", op_load_t(LoadDetectionPostProcess)); + + return true; +} + +} // namespace TEngine diff --git a/sysroot/Makefile b/sysroot/Makefile index 1c8007936..00e7cb9ea 100644 --- 
a/sysroot/Makefile +++ b/sysroot/Makefile @@ -14,3 +14,4 @@ debian32: .PHONY: ubuntu debian ubuntu32 debian32 + diff --git a/tests/Makefile b/tests/Makefile index b3534eb7b..b6dd4e0ca 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -69,6 +69,7 @@ $(BUILD_DIR)/%: $(BUILD_DIR)/%.o OPENCV_LIB=$(shell pkg-config --libs-only-l --libs-only-L opencv) LIBS+=-L ../build/ -ltengine -Wl,-allow-shlib-undefined + SO_LIBS+=-lprotobuf -ldl -lpthread SO_LIBS+=-Wl,-rpath,./build/ SO_LIBS+=$(OPENCV_LIB) diff --git a/tests/bin/Makefile b/tests/bin/Makefile index 0d1345d50..85b49b8c2 100644 --- a/tests/bin/Makefile +++ b/tests/bin/Makefile @@ -1,32 +1,43 @@ bin-obj-y+=bench_sqz.o bin-obj-y+=bench_mobilenet.o -bin-obj-y+=test_mxnet_sqz.o -bin-obj-y+=test_mxnet_mobilenet.o -bin-obj-y+=test_onnx_sqz.o bin-obj-y+=vgg16.o bin-obj-y+=test_deploy.o bin-obj-y+=demo.o bin-obj-y+=test_perf_stat.o bin-obj-y+=test_node_dump.o bin-obj-y+=two_model_demo.o -bin-obj-y+=test_lstm.o + +bin-obj-y+=test_rnn.o bin-obj-$(CONFIG_ACL_GPU)+=mt_mssd.o +ifeq ($(CONFIG_MXNET_SERIALIZER),y) +bin-obj-y+=test_mxnet.o +bin-obj-y+=test_mxnet_sqz.o +bin-obj-y+=test_mxnet_mobilenet.o +bin-obj-y+=test_mxnet_mobileface.o +bin-obj-y+=test_mxnet_lstm.o +bin-obj-y+=test_mxnet_gru.o +endif + ifeq ($(CONFIG_TF_SERIALIZER),y) bin-obj-y+=test_tf_mobilenet.o bin-obj-y+=test_tf_inceptionv3.o bin-obj-y+=test_tf_resnet50.o bin-obj-y+=test_tf.o +bin-obj-y+=test_tf_gru.o +bin-obj-y+=test_tf_lstm.o endif ifeq ($(CONFIG_TENGINE_SERIALIZER),y) bin-obj-y+=test_tm.o bin-obj-y+=save_model_src.o bin-obj-y+=load_model_src.o +bin-obj-y+=test_mobilenet.o endif ifeq ($(CONFIG_ONNX_SERIALIZER),y) +bin-obj-y+=test_onnx_sqz.o bin-obj-y+=test_onnx.o endif @@ -35,3 +46,4 @@ bin-obj-y+=tf_lite_mssd.o bin-obj-y+=tf_lite_mssd_quant.o bin-obj-y+=tf_lite_mobilenet_quant.o endif + diff --git a/tests/bin/bench_mobilenet.cpp b/tests/bin/bench_mobilenet.cpp index fe4fdda54..d28969acd 100644 --- a/tests/bin/bench_mobilenet.cpp +++ 
b/tests/bin/bench_mobilenet.cpp @@ -126,10 +126,9 @@ int main(int argc, char* argv[]) get_input_data(image_file, input_data, img_h, img_w, channel_mean, 0.017); - if(cpu_list_str) - set_cpu_list(cpu_list_str); - + set_cpu_list(cpu_list_str); + init_tengine(); std::cout << "run-time library version: " << get_tengine_version() << "\n"; @@ -137,7 +136,6 @@ int main(int argc, char* argv[]) if(request_tengine_version("0.9") < 0) return -1; - graph_t graph = create_graph(nullptr, "caffe", text_file, model_file); if(graph == nullptr) diff --git a/tests/bin/bench_sqz.cpp b/tests/bin/bench_sqz.cpp index 55f82bb93..e6afcc152 100644 --- a/tests/bin/bench_sqz.cpp +++ b/tests/bin/bench_sqz.cpp @@ -133,7 +133,7 @@ int main(int argc, char* argv[]) get_input_data(image_file, input_data, img_h, img_w, channel_mean, 1); if(cpu_list_str) - set_cpu_list(cpu_list_str); + set_cpu_list(cpu_list_str); init_tengine(); @@ -142,6 +142,7 @@ int main(int argc, char* argv[]) if(request_tengine_version("0.9") < 0) return -1; + graph_t graph = create_graph(nullptr, "caffe", text_file, model_file); if(graph == nullptr) diff --git a/tests/bin/load_model_src.cpp b/tests/bin/load_model_src.cpp index cde37a44d..1c5b64ab7 100644 --- a/tests/bin/load_model_src.cpp +++ b/tests/bin/load_model_src.cpp @@ -139,17 +139,15 @@ int main(int argc, char* argv[]) float* input_data = ( float* )malloc(sizeof(float) * img_h * img_w * 3); get_input_data(image_file, input_data, img_h, img_w, channel_mean, 1); - + if(cpu_list_str) set_cpu_list(cpu_list_str); - + init_tengine(); if(request_tengine_version("0.9") < 0) return 1; - - /* src_tm: the serailizer registered name * squeeze_net: the model name when saving the model */ diff --git a/tests/bin/test_deploy.cpp b/tests/bin/test_deploy.cpp index a92029b84..daa3f3051 100644 --- a/tests/bin/test_deploy.cpp +++ b/tests/bin/test_deploy.cpp @@ -72,6 +72,14 @@ int main(int argc, char* argv[]) tensor_t input_tensor = get_graph_tensor(graph, input_tensor_name); int dims[] = 
{1, 3, img_h, img_w}; set_tensor_shape(input_tensor, dims, 4); + + // if use gpu + int use_gpu = 0; + const char* gpu_flag = std::getenv("USE_GPU"); + if (gpu_flag) use_gpu= atoi(gpu_flag); + if (use_gpu) set_graph_device(graph, "acl_opencl"); + // + int ret_prerun = prerun_graph(graph); if(ret_prerun < 0) { diff --git a/tests/bin/test_mobilenet.cpp b/tests/bin/test_mobilenet.cpp new file mode 100644 index 000000000..07342f5a3 --- /dev/null +++ b/tests/bin/test_mobilenet.cpp @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2017, Open AI Lab + * Author: haitao@openailab.com + */ +#include +#include + +#include +#include +#include +#include +#include + +#include "tengine_c_api.h" +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" + +const char* model_file = "./models/mobilenet.tm"; +const char* image_file = "./tests/images/cat.jpg"; +const char* label_file = "./models/synset_words.txt"; + +const float channel_mean[3] = {104.007, 116.669, 122.679}; + + +int repeat_count = 100; + +unsigned long get_cur_time(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + + return (tv.tv_sec * 1000000 + tv.tv_usec); +} + + +void LoadLabelFile(std::vector& result, const char* fname) +{ + std::ifstream labels(fname); + + std::string line; + while(std::getline(labels, line)) + result.push_back(line); +} + +static inline bool PairCompare(const std::pair& lhs, const std::pair& rhs) +{ + return lhs.first > rhs.first; +} + + +static inline std::vector Argmax(const std::vector& v, int N) +{ + std::vector> pairs; + for(size_t i = 0; i < v.size(); ++i) + pairs.push_back(std::make_pair(v[i], i)); + std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare); + + std::vector result; + for(int i = 0; i < N; ++i) + result.push_back(pairs[i].second); + return result; +} + + +void get_input_data(const char* image_file, float* input_data, int img_h, int img_w, const float* mean, float scale) +{ + cv::Mat img = cv::imread(image_file, -1); + + if(img.empty()) + { + std::cerr << "failed to read image file " << image_file << "\n"; + return; + } + cv::resize(img, img, cv::Size(img_h, img_w)); + img.convertTo(img, CV_32FC3); + float* img_data = ( float* )img.data; + int hw = img_h * img_w; + for(int h = 0; h < img_h; h++) + for(int w = 0; w < img_w; w++) + for(int c = 0; c < 3; c++) + { + input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale; + img_data++; + } +} + +int main(int argc, char* argv[]) +{ + int res; + + while((res 
= getopt(argc, argv, "r:")) != -1) + { + switch(res) + { + case 'r': + repeat_count = strtoul(optarg, NULL, 10); + break; + + default: + break; + } + } + + int img_h = 224; + int img_w = 224; + + /* prepare input data */ + float* input_data = ( float* )malloc(sizeof(float) * img_h * img_w * 3); + + get_input_data(image_file, input_data, img_h, img_w, channel_mean, 0.017); + + + init_tengine(); + + std::cout << "run-time library version: " << get_tengine_version() << "\n"; + + if(request_tengine_version("1.0") < 0) + return -1; + + graph_t graph = create_graph(nullptr, "tengine", model_file); + + if(graph == nullptr) + { + std::cout << "Create graph0 failed\n"; + std::cout << "errno: " << get_tengine_errno() << "\n"; + return -1; + } + + /* get input tensor */ + int node_idx = 0; + int tensor_idx = 0; + + tensor_t input_tensor = get_graph_input_tensor(graph, node_idx, tensor_idx); + + if(input_tensor == nullptr) + { + std::printf("Cannot find input tensor,node_idx: %d,tensor_idx: %d\n", node_idx, tensor_idx); + return -1; + } + + int dims[] = {1, 3, img_h, img_w}; + + set_tensor_shape(input_tensor, dims, 4); + + /* setup input buffer */ + + if(set_tensor_buffer(input_tensor, input_data, 3 * img_h * img_w * 4) < 0) + { + std::printf("Set buffer for tensor failed\n"); + return -1; + } + + + /* run the graph */ + int ret_prerun = prerun_graph(graph); + if(ret_prerun < 0) + { + std::printf("prerun failed\n"); + return -1; + } + + dump_graph(graph); + + run_graph(graph, 1); + + // benchmark start here + printf("REPEAT COUNT= %d\n", repeat_count); + + unsigned long start_time = get_cur_time(); + + for(int i = 0; i < repeat_count; i++) + run_graph(graph, 1); + + unsigned long end_time = get_cur_time(); + + unsigned long off_time = end_time - start_time; + std::printf("Repeat [%d] time %.2f us per RUN. 
used %lu us\n", repeat_count, 1.0f * off_time / repeat_count, + off_time); + + /* get output tensor */ + tensor_t output_tensor = get_graph_output_tensor(graph, node_idx, tensor_idx); + + if(output_tensor == nullptr) + { + std::printf("Cannot find output tensor , node_idx: %d,tensor_idx: %d\n", node_idx, tensor_idx); + return -1; + } + + int count = get_tensor_buffer_size(output_tensor) / 4; + + float* data = ( float* )(get_tensor_buffer(output_tensor)); + float* end = data + count; + + std::vector result(data, end); + + std::vector top_N = Argmax(result, 5); + + std::vector labels; + + LoadLabelFile(labels, label_file); + + for(unsigned int i = 0; i < top_N.size(); i++) + { + int idx = top_N[i]; + + std::cout << std::fixed << std::setprecision(4) << result[idx] << " - \""; + std::cout << labels[idx] << "\"\n"; + } + + release_graph_tensor(output_tensor); + release_graph_tensor(input_tensor); + postrun_graph(graph); + destroy_graph(graph); + + free(input_data); + + std::cout << "ALL TEST DONE\n"; + + release_tengine(); + return 0; +} diff --git a/tests/bin/test_mxnet.cpp b/tests/bin/test_mxnet.cpp new file mode 100644 index 000000000..e62f85903 --- /dev/null +++ b/tests/bin/test_mxnet.cpp @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2018, Open AI Lab + * Author: chunyinglv@openailab.com + */ +#include +#include +#include +#include "tengine_c_api.h" +#include + +int main(int argc, char* argv[]) +{ + if(argc < 4) + { + std::cout << "[Usage]: " << argv[0] << " \n"; + return 1; + } + + // init tengine + init_tengine(); + if(request_tengine_version("0.9") < 0) + return 1; + + // create graph + printf("%s\n",argv[1]); + printf("%s\n",argv[2]); + graph_t graph = create_graph(nullptr, "mxnet", argv[1],argv[2]); + if(graph == nullptr) + { + std::cout << "Create graph failed\n"; + std::cout << "errno: " << get_tengine_errno() << "\n"; + return 1; + } + std::cout << "Create graph success\n"; + + // input + int img_h = atoi(argv[3]); + int img_w = img_h; + if(argc == 5) + img_w = atoi(argv[4]); + int img_size = img_h * img_w * 1 * 1; + + float* input_data = ( float* )malloc(sizeof(float) * img_size); + for(int i =0;i + +#include +#include +#include +#include +#include +#include + +#include "tengine_c_api.h" + +std::string model_name1 = "./models/GRU/Fused_Neural_Net-symbol.json"; +std::string model_name2 = "./models/GRU/Fused_Neural_Net-0100.params"; + +int main(int argc, char* argv[]) +{ + int steps = 1; + int res; + + while((res = getopt(argc, argv, "n:")) != -1) + { + switch(res) + { + case 'n': + steps = strtoul(optarg, NULL, 10); + break; + default: + break; + } + } + + init_tengine(); + + graph_t graph = create_graph(nullptr, "mxnet", model_name1.c_str(),model_name2.c_str()); + + // set_graph_layout(graph,TENGINE_LAYOUT_NCHW); + // dump_graph(graph); + + if(graph == nullptr) + { + std::cout << "Create graph0 failed\n"; + return 1; + } + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + + int dim[3] = {steps,28, 28}; + + set_tensor_shape(input_tensor, dim, 3); + + int input_size = get_tensor_buffer_size(input_tensor); + float* input_data = ( float* 
)malloc(input_size); + + for(unsigned int i = 0; i < input_size / sizeof(float); i++) + input_data[i] = 45; + + set_tensor_buffer(input_tensor, input_data, input_size); + + // std::cout<<"intensr "<