diff --git a/ModelOptimizations/DlQuantization/include/DlQuantization/Fp16Quantization.hpp b/ModelOptimizations/DlQuantization/include/DlQuantization/Fp16Quantization.hpp
index 21bf2527ede..d6d732a0c39 100644
--- a/ModelOptimizations/DlQuantization/include/DlQuantization/Fp16Quantization.hpp
+++ b/ModelOptimizations/DlQuantization/include/DlQuantization/Fp16Quantization.hpp
@@ -49,7 +49,7 @@ namespace DlQuantization
  * @param cnt total size of input tensor
  * @param out pointer to the output tensor
  */
-    void quantizeDequantizeFp16Gpu(const float* in, int cnt, float* out);
+    void quantizeDequantizeFp16Gpu(const float* in, int cnt, float* out, void* stream = nullptr);
 
 }
diff --git a/ModelOptimizations/DlQuantization/include/DlQuantization/ITensorQuantizationSim.h b/ModelOptimizations/DlQuantization/include/DlQuantization/ITensorQuantizationSim.h
index ea9074f543e..270584e1088 100644
--- a/ModelOptimizations/DlQuantization/include/DlQuantization/ITensorQuantizationSim.h
+++ b/ModelOptimizations/DlQuantization/include/DlQuantization/ITensorQuantizationSim.h
@@ -52,6 +52,13 @@ class ITensorQuantizationSim
                                           double encodingMin, double encodingMax, uint8_t bw, RoundingMode roundMode,
                                           bool use_cuda) = 0;
+
+    virtual void quantizeDequantizeTensor(const DTYPE* inputTensorData, size_t inputTensorCount,
+                                          DTYPE* outputTensorData,
+                                          double encodingMin, double encodingMax,
+                                          uint8_t bw, RoundingMode roundMode,
+                                          bool use_cuda, void* stream) = 0;
+
     virtual void quantizeTensor(const DTYPE* inputTensorData, size_t inputTensorCount, DTYPE* outputTensorData,
                                 double encodingMin, double encodingMax, uint8_t bw, RoundingMode roundMode,
                                 bool use_cuda, bool shiftToSigned) = 0;
@@ -112,6 +119,12 @@ class ITensorQuantizationSim
                                                     DTYPE* encodingDelta, DTYPE* encodingOffset,
                                                     RoundingMode roundingMode, bool useCuda) = 0;
+    virtual void quantizeDequantizeTensorPerChannel(const DTYPE* inputTensorData, size_t numChannel,
+                                                    size_t numElement, size_t numElementPerChannel,
+                                                    DTYPE* outputTensorData, DTYPE* encodingMin, DTYPE* encodingMax,
+                                                    DTYPE* encodingDelta, DTYPE* encodingOffset,
+                                                    RoundingMode roundingMode, bool useCuda, void* stream) = 0;
+
 };
 
 } // namespace DlQuantization
diff --git a/ModelOptimizations/DlQuantization/include/DlQuantization/TensorQuantizer.h b/ModelOptimizations/DlQuantization/include/DlQuantization/TensorQuantizer.h
index a54401ee40a..b525290cc21 100644
--- a/ModelOptimizations/DlQuantization/include/DlQuantization/TensorQuantizer.h
+++ b/ModelOptimizations/DlQuantization/include/DlQuantization/TensorQuantizer.h
@@ -118,6 +118,9 @@ class TensorQuantizer : public TensorQuantizerOpFacade
     void quantizeDequantize(const float* input, std::size_t tensorSize, float* output, double encodingMin,
                             double encodingMax, unsigned int bitwidth, bool useCuda) override;
 
+    void quantizeDequantize(const float* input, std::size_t tensorSize, float* output, double encodingMin,
+                            double encodingMax, unsigned int bitwidth, bool useCuda, void* stream) override;
+
     /**
      * @brief Convert a tensor from DTYPE to quantized 8-bit packed format
      * @relates quantizeDequantize, except output is stored in 8-bit packed format
diff --git a/ModelOptimizations/DlQuantization/include/DlQuantization/TensorQuantizerOpFacade.h b/ModelOptimizations/DlQuantization/include/DlQuantization/TensorQuantizerOpFacade.h
index c458b438de1..63d020c5265 100644
--- a/ModelOptimizations/DlQuantization/include/DlQuantization/TensorQuantizerOpFacade.h
+++ b/ModelOptimizations/DlQuantization/include/DlQuantization/TensorQuantizerOpFacade.h
@@ -90,6 +90,8 @@ class TensorQuantizerOpFacade
     virtual void quantizeDequantize(const float* input, std::size_t tensorSize, float* output, double encodingMin,
                                     double encodingMax, unsigned int bitwidth, bool useCuda) = 0;
 
+    virtual void quantizeDequantize(const float* input, std::size_t tensorSize, float* output, double encodingMin,
+                                    double encodingMax, unsigned int bitwidth, bool useCuda, void* stream) = 0;
     /**
      * Compute the encoding for this tensor using stats collected so far
      */
diff --git a/ModelOptimizations/DlQuantization/src/Fp16Quantization.cpp b/ModelOptimizations/DlQuantization/src/Fp16Quantization.cpp
index 07ef579aa10..eb3bbde25c5 100644
--- a/ModelOptimizations/DlQuantization/src/Fp16Quantization.cpp
+++ b/ModelOptimizations/DlQuantization/src/Fp16Quantization.cpp
@@ -43,10 +43,10 @@ namespace DlQuantization
 {
-    void quantizeDequantizeFp16Gpu(const float* in, int cnt, float* out)
+    void quantizeDequantizeFp16Gpu(const float* in, int cnt, float* out, void* stream)
     {
 #ifdef GPU_QUANTIZATION_ENABLED
-        quantizeDequantizeFp16ForGPU(in, cnt, out);
+        quantizeDequantizeFp16ForGPU(in, cnt, out, stream);
 #else
         throw std::runtime_error("Not compiled for GPU mode.");
 #endif
diff --git a/ModelOptimizations/DlQuantization/src/MainQuantizationClass.cpp b/ModelOptimizations/DlQuantization/src/MainQuantizationClass.cpp
index 3709713c7c5..ffaa0c631c2 100644
--- a/ModelOptimizations/DlQuantization/src/MainQuantizationClass.cpp
+++ b/ModelOptimizations/DlQuantization/src/MainQuantizationClass.cpp
@@ -116,7 +116,7 @@ void MainQuantizationClass::QuantizeDequantizeActs(const string& layer, L
     for (unsigned int blob_id = 0; blob_id < acts.size(); ++blob_id)
     {
         quantizeDequantize(acts[blob_id], count[blob_id], encoding[blob_id], acts_quantized[blob_id], m_ModeCpuGpu,
-                           ROUND_NEAREST);
+                           ROUND_NEAREST, nullptr);
     }
 }
@@ -126,7 +126,7 @@ void MainQuantizationClass::QuantizeDequantizeParams(int bw, DTYPE* param
                                                      TfEncoding& encoding)
 {
     m_QuantAlgo->NumberDistributionToFxpFormat(bw, params, count, encoding);
-    quantizeDequantize(params, count, encoding, params_quantized, m_ModeCpuGpu, mode_rounding);
+    quantizeDequantize(params, count, encoding, params_quantized, m_ModeCpuGpu, mode_rounding, nullptr);
 }
 
 template
diff --git a/ModelOptimizations/DlQuantization/src/TensorQuantizationSim.cpp b/ModelOptimizations/DlQuantization/src/TensorQuantizationSim.cpp
index 40208062d22..7550109fcb8 100644
--- a/ModelOptimizations/DlQuantization/src/TensorQuantizationSim.cpp
+++ b/ModelOptimizations/DlQuantization/src/TensorQuantizationSim.cpp
@@ -95,11 +95,21 @@ void TensorQuantizationSim::quantizeDequantizeTensor(const DTYPE* inputTe
                                                      DTYPE* outputTensorData, double encodingMin, double encodingMax,
                                                      uint8_t bw, RoundingMode roundingMode, bool use_cuda)
+{
+    quantizeDequantizeTensor(inputTensorData, inputTensorCount, outputTensorData, encodingMin, encodingMax,
+                             bw, roundingMode, use_cuda, nullptr);
+}
+
+template
+void TensorQuantizationSim::quantizeDequantizeTensor(const DTYPE* inputTensorData, size_t inputTensorCount,
+                                                     DTYPE* outputTensorData, double encodingMin,
+                                                     double encodingMax, uint8_t bw, RoundingMode roundingMode,
+                                                     bool use_cuda, void* stream)
 {
     TfEncoding encoding;
     fillEncodingInfo(encoding, bw, encodingMin, encodingMax);
     quantizeDequantize(inputTensorData, inputTensorCount, encoding, outputTensorData, getComputationMode(use_cuda),
-                       roundingMode);
+                       roundingMode, stream);
 }
 
 template
@@ -145,7 +155,7 @@ void TensorQuantizationSim::quantizeDequantizePerChannelTensor(
     for (uint32_t i = 0; i < splits.size(); ++i)
     {
         auto& split = splits[i];
         quantizeDequantize(split.data(), split.size(), completeEncodings[i], split.data(), getComputationMode(useCuda),
-                           roundMode);
+                           roundMode, nullptr);
     }
 
     // Concatenate the quantized data back into its original shape.
@@ -269,6 +279,19 @@ void TensorQuantizationSim::quantizeDequantizeTensorPerChannel(const DTYP
                                                                DTYPE* encodingMax, DTYPE* encodingDelta,
                                                                DTYPE* encodingOffset, RoundingMode roundingMode,
                                                                bool useCuda)
+{
+    quantizeDequantizeTensorPerChannel(inputTensorData, numChannel, numElement, numElementPerChannel, outputTensorData,
+                                       encodingMin, encodingMax, encodingDelta, encodingOffset, roundingMode, useCuda,
+                                       nullptr);
+}
+
+template
+void TensorQuantizationSim::quantizeDequantizeTensorPerChannel(const DTYPE* inputTensorData, size_t numChannel,
+                                                               size_t numElement, size_t numElementPerChannel,
+                                                               DTYPE* outputTensorData, DTYPE* encodingMin,
+                                                               DTYPE* encodingMax, DTYPE* encodingDelta,
+                                                               DTYPE* encodingOffset, RoundingMode roundingMode,
+                                                               bool useCuda, void* stream)
 {
     DlQuantization::ComputationMode cpuGpuMode;
     if (useCuda)
@@ -277,7 +300,8 @@ void TensorQuantizationSim::quantizeDequantizeTensorPerChannel(const DTYP
         cpuGpuMode = DlQuantization::ComputationMode::COMP_MODE_CPU;
 
     quantizeDequantizePerChannel(inputTensorData, numChannel, numElement, numElementPerChannel, outputTensorData,
-                                 encodingMin, encodingMax, encodingDelta, encodingOffset, cpuGpuMode, roundingMode);
+                                 encodingMin, encodingMax, encodingDelta, encodingOffset, cpuGpuMode, roundingMode,
+                                 stream);
 }
 
 template class TensorQuantizationSim;
diff --git a/ModelOptimizations/DlQuantization/src/TensorQuantizationSim.h b/ModelOptimizations/DlQuantization/src/TensorQuantizationSim.h
index 8e6701e6b55..ef109f198b5 100644
--- a/ModelOptimizations/DlQuantization/src/TensorQuantizationSim.h
+++ b/ModelOptimizations/DlQuantization/src/TensorQuantizationSim.h
@@ -60,6 +60,10 @@ class TensorQuantizationSim : public ITensorQuantizationSim
                                   double encodingMin, double encodingMax, uint8_t bw, RoundingMode roundMode,
                                   bool use_cuda) override;
 
+    void quantizeDequantizeTensor(const DTYPE* inputTensorData, size_t inputTensorCount, DTYPE* outputTensorData,
+                                  double encodingMin, double encodingMax, uint8_t bw, RoundingMode roundMode,
+                                  bool use_cuda, void* stream) override;
+
     void quantizeTensor(const DTYPE* inputTensorData, size_t inputTensorCount, DTYPE* outputTensorData,
                         double encodingMin, double encodingMax, uint8_t bw, RoundingMode roundMode, bool use_cuda,
                         bool shiftToSigned)
@@ -99,6 +103,11 @@ class TensorQuantizationSim : public ITensorQuantizationSim
                                             DTYPE* encodingMax, DTYPE* encodingDelta, DTYPE* encodingOffset,
                                             RoundingMode roundingMode, bool useCuda) override;
 
+    void quantizeDequantizeTensorPerChannel(const DTYPE* inputTensorData, size_t numChannel, size_t numElement,
+                                            size_t numElementPerChannel, DTYPE* outputTensorData, DTYPE* encodingMin,
+                                            DTYPE* encodingMax, DTYPE* encodingDelta, DTYPE* encodingOffset,
+                                            RoundingMode roundingMode, bool useCuda, void* stream) override;
+
     inline DlQuantization::ComputationMode getComputationMode(bool use_cuda)
     {
         return (use_cuda ? DlQuantization::ComputationMode::COMP_MODE_GPU
diff --git a/ModelOptimizations/DlQuantization/src/TensorQuantizationSimForPython.cpp b/ModelOptimizations/DlQuantization/src/TensorQuantizationSimForPython.cpp
index d08b18b7ba9..48aae4826b9 100644
--- a/ModelOptimizations/DlQuantization/src/TensorQuantizationSimForPython.cpp
+++ b/ModelOptimizations/DlQuantization/src/TensorQuantizationSimForPython.cpp
@@ -75,7 +75,7 @@ py::array_t TensorQuantizationSimForPython::quantizeDequantize(py::array_
     _tensorQuantizationSim->quantizeDequantizeTensor(inputDataPtr, inputTensorSize, outputDataPtr, encoding.min,
                                                      encoding.max, bitwidth, roundingMode,
-                                                     use_cuda);
+                                                     use_cuda, nullptr);
 
     return output;
 }
diff --git a/ModelOptimizations/DlQuantization/src/TensorQuantizer.cpp b/ModelOptimizations/DlQuantization/src/TensorQuantizer.cpp
index ebcc966270c..03b99649694 100644
--- a/ModelOptimizations/DlQuantization/src/TensorQuantizer.cpp
+++ b/ModelOptimizations/DlQuantization/src/TensorQuantizer.cpp
@@ -164,10 +164,16 @@ void TensorQuantizer::computeEncodingFromData(uint8_t bw, const float* data, siz
 void TensorQuantizer::quantizeDequantize(const float* input, std::size_t tensorSize, float* output,
                                          double encodingMin, double encodingMax, unsigned int bitwidth, bool useCuda)
+{
+    quantizeDequantize(input, tensorSize, output, encodingMin, encodingMax, bitwidth, useCuda, nullptr);
+}
+
+void TensorQuantizer::quantizeDequantize(const float* input, std::size_t tensorSize, float* output, double encodingMin,
+                                         double encodingMax, unsigned int bitwidth, bool useCuda, void* stream)
 {
     assert(isEncodingValid);
 
     _tensorQuantizationSim->quantizeDequantizeTensor(input, tensorSize, output, encodingMin, encodingMax, bitwidth,
-                                                     roundingMode, useCuda);
+                                                     roundingMode, useCuda, stream);
 }
 
 void TensorQuantizer::quantizeTensorPacked(const float* input, std::size_t tensorSize, std::vector& output,
diff --git a/ModelOptimizations/DlQuantization/src/trim_functions.cpp b/ModelOptimizations/DlQuantization/src/trim_functions.cpp
index 57de092b419..44d181be9c9 100644
--- a/ModelOptimizations/DlQuantization/src/trim_functions.cpp
+++ b/ModelOptimizations/DlQuantization/src/trim_functions.cpp
@@ -92,7 +92,7 @@ Lambda parallelize(const uint32_t number_of_threads, Lambda lambda)
 // encoding: TF: rounded
 template
 void quantizeDequantize(const DTYPE* in, int cnt, const TfEncoding& encoding, DTYPE* out,
-                        ComputationMode mode_cpu_gpu, RoundingMode rounding_mode)
+                        ComputationMode mode_cpu_gpu, RoundingMode rounding_mode, void* stream)
 {
     switch (mode_cpu_gpu)
     {
@@ -101,7 +101,7 @@ void quantizeDequantize(const DTYPE* in, int cnt, const TfEncoding& encoding, DT
         break;
     case COMP_MODE_GPU:
 #ifdef GPU_QUANTIZATION_ENABLED
-        quantizeDequantizeGpu(in, cnt, encoding, out, rounding_mode);
+        quantizeDequantizeGpu(in, cnt, encoding, out, rounding_mode, stream);
 #else
         throw runtime_error("Not compiled for GPU mode.");
 #endif
@@ -602,7 +602,7 @@ void dequantizeFromPackedFxpCpu(const uint8_t* input, int cnt,
 template
 void quantizeDequantizePerChannel(const DTYPE* in, int numChannel, int numElement, int numElementPerChannel,
                                   DTYPE* out, DTYPE* encodingMin, DTYPE* encodingMax, DTYPE* encodingDelta, DTYPE* encodingOffset,
-                                  ComputationMode modeCpuGpu, RoundingMode roundingMode)
+                                  ComputationMode modeCpuGpu, RoundingMode roundingMode, void* stream)
 {
     switch (modeCpuGpu)
     {
@@ -613,7 +613,7 @@ void quantizeDequantizePerChannel(const DTYPE* in, int numChannel, int numElemen
     case COMP_MODE_GPU:
 #ifdef GPU_QUANTIZATION_ENABLED
         quantizeDequantizePerChannelGpu(in, numChannel, numElement, numElementPerChannel, out, encodingMin, encodingMax,
-                                        encodingDelta, encodingOffset, roundingMode);
+                                        encodingDelta, encodingOffset, roundingMode, stream);
 #else
         throw runtime_error("Not compiled for GPU mode.");
 #endif
@@ -643,10 +643,10 @@ void quantizeDequantizePerChannelCpu(const DTYPE* in, int numChannel, int numEle
 // Explicit instantiations
 template void quantizeDequantize(const double* in, int cnt, const TfEncoding& encoding, double* out,
-                                 ComputationMode mode_cpu_gpu, RoundingMode rounding_mode);
+                                 ComputationMode mode_cpu_gpu, RoundingMode rounding_mode, void* stream);
 
 template void quantizeDequantize(const float* in, int cnt, const TfEncoding& encoding, float* out,
-                                 ComputationMode mode_cpu_gpu, RoundingMode rounding_mode);
+                                 ComputationMode mode_cpu_gpu, RoundingMode rounding_mode, void* stream);
 
 template void quantizeToFxp(const double* in, int cnt, const TfEncoding& encoding, double* out,
                             ComputationMode mode_cpu_gpu, RoundingMode rounding_mode, bool shiftToSigned);
@@ -669,9 +669,9 @@ template void dequantizeFromPackedFxp(const uint8_t* input, int cnt,
 template void quantizeDequantizePerChannel(const float* in, int numChannel, int numElement, int numElementPerChannel, float* out,
                                            float* encodingMin, float* encodingMax, float* encodingDelta, float* encodingOffset,
-                                           ComputationMode modeCpuGpu, RoundingMode roundingMode);
+                                           ComputationMode modeCpuGpu, RoundingMode roundingMode, void* stream);
 
 template void quantizeDequantizePerChannel(const double* in, int numChannel, int numElement, int numElementPerChannel, double* out,
                                            double* encodingMin, double* encodingMax, double* encodingDelta, double* encodingOffset,
-                                           ComputationMode modeCpuGpu, RoundingMode roundingMode);
+                                           ComputationMode modeCpuGpu, RoundingMode roundingMode, void* stream);
 
 } // End of namespace DlQuantization
diff --git a/ModelOptimizations/DlQuantization/src/trim_functions.cu b/ModelOptimizations/DlQuantization/src/trim_functions.cu
index 8fd8dc3dd75..8ed78f207aa 100644
--- a/ModelOptimizations/DlQuantization/src/trim_functions.cu
+++ b/ModelOptimizations/DlQuantization/src/trim_functions.cu
@@ -36,7 +36,6 @@
 //
 //==============================================================================
 
-#include
 #include "cuda_fp16.h"
 #include "cuda_util.hpp"
 #include "trim_functions.cuh"
@@ -93,12 +92,12 @@ __global__ void quantizeDequantizePerChannelKernel(const DTYPE* in, int numChann
 }
 
 template
-void quantizeDequantizeGpu(const DTYPE* in, int cnt, const TfEncoding& encoding,
-                           DTYPE* out, RoundingMode rounding_mode)
+void quantizeDequantizeGpu(const DTYPE* in, int cnt, const TfEncoding& encoding, DTYPE* out, RoundingMode rounding_mode,
+                           void* stream)
 {
-    quantizeDequantizeKernel<<>>(
-        in, cnt, out, encoding.min, encoding.max, encoding.delta,
-        encoding.offset, rounding_mode);
+    quantizeDequantizeKernel
+        <<(stream)>>>(
+            in, cnt, out, encoding.min, encoding.max, encoding.delta, encoding.offset, rounding_mode);
 }
@@ -111,9 +110,10 @@ __global__ void quantizeDequantizeFp16Kernel(const float* in, int cnt, float* ou
 }
 
-void quantizeDequantizeFp16ForGPU(const float* in, int cnt, float* out)
+void quantizeDequantizeFp16ForGPU(const float* in, int cnt, float* out, void* stream)
 {
-    quantizeDequantizeFp16Kernel<<>>(in, cnt, out);
+    quantizeDequantizeFp16Kernel<<(stream)>>>(
+        in, cnt, out);
 }
@@ -131,23 +131,22 @@ void quantizeToFxpGpu(const DTYPE* in, int cnt, const TfEncoding& encoding,
 }
 
 template
-void quantizeDequantizePerChannelGpu(const DTYPE* in, int numChannel, int numElement, int numElementPerChannel, DTYPE* out,
-                                     DTYPE* encodingMin, DTYPE* encodingMax, DTYPE* encodingDelta, DTYPE* encodingOffset,
-                                     RoundingMode roundingMode)
+void quantizeDequantizePerChannelGpu(const DTYPE* in, int numChannel, int numElement, int numElementPerChannel,
+                                     DTYPE* out, DTYPE* encodingMin, DTYPE* encodingMax, DTYPE* encodingDelta,
+                                     DTYPE* encodingOffset, RoundingMode roundingMode, void* stream)
 {
-
-    quantizeDequantizePerChannelKernel<<>>(
-        in, numChannel, numElement, numElementPerChannel, out, encodingMin, encodingMax, encodingDelta,
-        encodingOffset, roundingMode);
-
+    quantizeDequantizePerChannelKernel
+        <<(stream)>>>(
+            in, numChannel, numElement, numElementPerChannel, out, encodingMin, encodingMax, encodingDelta,
+            encodingOffset, roundingMode);
 }
 
 // Explicit instantiations
 template void quantizeDequantizeGpu(const double* in, int cnt, const TfEncoding& encoding, double* out,
-                                    RoundingMode rounding_mode);
+                                    RoundingMode rounding_mode, void* stream);
 
 template void quantizeDequantizeGpu(const float* in, int cnt, const TfEncoding& encoding, float* out,
-                                    RoundingMode rounding_mode);
+                                    RoundingMode rounding_mode, void* stream);
 
 template void quantizeToFxpGpu(const double* in, int cnt, const TfEncoding& encoding, double* out,
                                RoundingMode rounding_mode, bool shiftToSigned);
@@ -156,12 +155,13 @@ template void quantizeToFxpGpu(const double* in, int cnt, const TfEncoding& enco
 template void quantizeToFxpGpu(const float* in, int cnt, const TfEncoding& encoding, float* out,
                                RoundingMode rounding_mode, bool shiftToSigned);
 
-template void quantizeDequantizePerChannelGpu(const float* in, int numChannel, int numElement, int numElementPerChannel, float* out,
-                                              float* encodingMin, float* encodingMax, float* encodingDelta, float* encodingOffset,
-                                              RoundingMode roundingMode);
+template void quantizeDequantizePerChannelGpu(const float* in, int numChannel, int numElement, int numElementPerChannel,
+                                              float* out, float* encodingMin, float* encodingMax, float* encodingDelta,
+                                              float* encodingOffset, RoundingMode roundingMode, void* stream);
 
-template void quantizeDequantizePerChannelGpu(const double* in, int numChannel, int numElement, int numElementPerChannel, double* out,
-                                              double* encodingMin, double* encodingMax, double* encodingDelta, double* encodingOffset,
-                                              RoundingMode roundingMode);
+template void quantizeDequantizePerChannelGpu(const double* in, int numChannel, int numElement,
+                                              int numElementPerChannel, double* out, double* encodingMin,
+                                              double* encodingMax, double* encodingDelta, double* encodingOffset,
+                                              RoundingMode roundingMode, void* stream);
 
 } // End of namespace DlQuantization
diff --git a/ModelOptimizations/DlQuantization/src/trim_functions.hpp b/ModelOptimizations/DlQuantization/src/trim_functions.hpp
index 3734e863f56..cce6235bd4b 100644
--- a/ModelOptimizations/DlQuantization/src/trim_functions.hpp
+++ b/ModelOptimizations/DlQuantization/src/trim_functions.hpp
@@ -51,10 +51,10 @@ inline double randUniformCpu();
 template
 void quantizeDequantize(const DTYPE* in, int cnt, const TfEncoding& encoding, DTYPE* out, ComputationMode mode_cpu_gpu,
-                        RoundingMode rounding_mode);
+                        RoundingMode rounding_mode, void* stream);
 
-void quantizeDequantizeFp16ForGPU(const float* in, int cnt, float* out);
+void quantizeDequantizeFp16ForGPU(const float* in, int cnt, float* out, void* stream);
 
 template
@@ -98,7 +98,7 @@ double computeOffset(double encodingMin, double delta);
 template
 void quantizeDequantizePerChannel(const DTYPE* in, int numChannel, int numElement, int numElementPerChannel,
                                   DTYPE* out, DTYPE* encodingMin, DTYPE* encodingMax, DTYPE* encodingDelta, DTYPE* encodingOffset,
-                                  ComputationMode modeCpuGpu, RoundingMode roundingMode);
+                                  ComputationMode modeCpuGpu, RoundingMode roundingMode, void* stream);
 
 // GPU implementations ...
 #ifdef GPU_QUANTIZATION_ENABLED
@@ -109,12 +109,12 @@ void quantizeToFxpGpu(const DTYPE* in, int cnt, const TfEncoding& encoding, DTYP
 template
 void quantizeDequantizeGpu(const DTYPE* in, int cnt, const TfEncoding& encoding, DTYPE* out,
-                           RoundingMode rounding_mode);
+                           RoundingMode rounding_mode, void* stream);
 
 template
 void quantizeDequantizePerChannelGpu(const DTYPE* in, int numChannel, int numElement, int numElementPerChannel,
                                      DTYPE* out, DTYPE* encodingMin, DTYPE* encodingMax, DTYPE* encodingDelta, DTYPE* encodingOffset,
-                                     RoundingMode roundingMode);
+                                     RoundingMode roundingMode, void* stream);
 
 #endif // GPU_QUANTIZATION_ENABLED
diff --git a/TrainingExtensions/onnx/src/AimetOpUtils.cpp b/TrainingExtensions/onnx/src/AimetOpUtils.cpp
index e74b5fd63d6..85c9ddd17b2 100644
--- a/TrainingExtensions/onnx/src/AimetOpUtils.cpp
+++ b/TrainingExtensions/onnx/src/AimetOpUtils.cpp
@@ -39,13 +39,14 @@
 #include "AimetOpUtils.h"
 
 template
-void copyInputTensorsToOutputTensors(const T* inTensor, size_t count, T* outTensor, bool useCuda)
+void copyInputTensorsToOutputTensors(const T* inTensor, size_t count, T* outTensor, bool useCuda, void* stream)
 {
     // copy input_tensor to output_tensor
     if (useCuda)
     {
 #ifdef ONNX_CUDA
-        cudaMemcpy(outTensor, inTensor, count * sizeof(float), cudaMemcpyDeviceToDevice);
+        cudaMemcpyAsync(outTensor, inTensor, count * sizeof(float), cudaMemcpyDeviceToDevice,
+                        reinterpret_cast(stream));
 #else
         throw std::runtime_error("Not compiled for GPU mode.");
 #endif
@@ -66,4 +67,5 @@ void quantizeDequantizeFp16Cpu(const float* in, int cnt, float* out)
 }
 
-template void copyInputTensorsToOutputTensors(const float* inTensor, size_t count, float* outTensor, bool useCuda);
+template void copyInputTensorsToOutputTensors(const float* inTensor, size_t count, float* outTensor, bool useCuda,
+                                              void* stream);
diff --git a/TrainingExtensions/onnx/src/AimetOpUtils.h b/TrainingExtensions/onnx/src/AimetOpUtils.h
index 2dacb2adb06..f874a63f05b 100644
--- a/TrainingExtensions/onnx/src/AimetOpUtils.h
+++ b/TrainingExtensions/onnx/src/AimetOpUtils.h
@@ -89,7 +89,7 @@ class OnnxCpuAllocator : public DlQuantization::IAllocator
 };
 
 template
-void copyInputTensorsToOutputTensors(const T* inTensor, size_t count, T* outTensor, bool useCuda);
+void copyInputTensorsToOutputTensors(const T* inTensor, size_t count, T* outTensor, bool useCuda, void* stream);
 
 void quantizeDequantizeFp16Cpu(const float* in, int cnt, float* out);
@@ -98,9 +98,9 @@ template
 void modeSpecificActionInt(const T* inTensor, size_t count, T* outTensor, DlQuantization::TensorQuantizer* tensorQuantizer,
                            const DlQuantization::TensorQuantizerOpMode opMode, DlQuantization::TfEncoding* encoding,
-                           const bool useSymmetricEncoding, DlQuantization::IAllocator* allocator, bool useCuda)
+                           const bool useSymmetricEncoding, DlQuantization::IAllocator* allocator, bool useCuda,
+                           void* stream)
 {
-
     switch (opMode)
     {
     case DlQuantization::TensorQuantizerOpMode::oneShotQuantizeDequantize:
     {
@@ -110,7 +110,7 @@ void modeSpecificActionInt(const T* inTensor, size_t count, T* outTensor,
         DlQuantization::TfEncoding initial_encoding =
             tensorQuantizer->computeEncoding(encoding->bw, useSymmetricEncoding);
         tensorQuantizer->quantizeDequantize(inTensor, count, outTensor, initial_encoding.min, initial_encoding.max,
-                                            encoding->bw, useCuda);
+                                            encoding->bw, useCuda, stream);
         // Update encoding object with computed encoding
         encoding->min = initial_encoding.min;
         encoding->max = initial_encoding.max;
@@ -121,18 +121,18 @@ void modeSpecificActionInt(const T* inTensor, size_t count, T* outTensor,
     case DlQuantization::TensorQuantizerOpMode::updateStats:
     {
         tensorQuantizer->updateStats(inTensor, count, useCuda, allocator);
-        copyInputTensorsToOutputTensors(inTensor, count, outTensor, useCuda);
+        copyInputTensorsToOutputTensors(inTensor, count, outTensor, useCuda, stream);
         break;
     }
     case DlQuantization::TensorQuantizerOpMode::quantizeDequantize:
     {
         tensorQuantizer->quantizeDequantize(inTensor, count, outTensor, encoding->min, encoding->max, encoding->bw,
-                                            useCuda);
+                                            useCuda, stream);
         break;
     }
     case DlQuantization::TensorQuantizerOpMode::passThrough:
     {
-        copyInputTensorsToOutputTensors(inTensor, count, outTensor, useCuda);
+        copyInputTensorsToOutputTensors(inTensor, count, outTensor, useCuda, stream);
         break;
     }
     default:
@@ -148,7 +148,7 @@ void modeSpecificActionPerChannelInt(
     const T* inTensor, size_t count, T* outTensor, int axis, OrtTensorDimensions& dims,
     std::vector& tensorQuantizers, const DlQuantization::TensorQuantizerOpMode opMode,
     std::vector& encodings, const bool useSymmetricEncoding,
-    DlQuantization::IAllocator* allocator, bool useCuda,
+    DlQuantization::IAllocator* allocator, bool useCuda, void* stream,
     std::unique_ptr>& tensorQuantizationSim)
 {
     size_t numChannels = dims[axis];
@@ -180,7 +180,7 @@ void modeSpecificActionPerChannelInt(
             encodings[ch]->delta = channelEncoding.delta;
         }
         quantizeDequantizePerChannel(inTensor, dims, axis, outTensor, encodings, tensorQuantizers, useCuda, allocator,
-                                     tensorQuantizationSim);
+                                     stream, tensorQuantizationSim);
         allocator->deleteRaw(channelBuffer);
         break;
     }
@@ -194,18 +194,18 @@ void modeSpecificActionPerChannelInt(
             tensorQuantizer->updateStats(channelBuffer, channelSize, useCuda, allocator);
         }
         allocator->deleteRaw(channelBuffer);
-        copyInputTensorsToOutputTensors(inTensor, count, outTensor, useCuda);
+        copyInputTensorsToOutputTensors(inTensor, count, outTensor, useCuda, stream);
         break;
     }
     case DlQuantization::TensorQuantizerOpMode::quantizeDequantize:
    {
         quantizeDequantizePerChannel(inTensor, dims, axis, outTensor, encodings, tensorQuantizers, useCuda, allocator,
-                                     tensorQuantizationSim);
+                                     stream, tensorQuantizationSim);
         break;
     }
     case DlQuantization::TensorQuantizerOpMode::passThrough:
     {
-        copyInputTensorsToOutputTensors(inTensor, count, outTensor, useCuda);
+        copyInputTensorsToOutputTensors(inTensor, count, outTensor, useCuda, stream);
         break;
     }
     default:
@@ -217,17 +217,17 @@ void modeSpecificActionPerChannelInt(
 
 template
 void modeSpecificActionFloat(const T* inTensor, size_t count, T* outTensor,
-                             const DlQuantization::TensorQuantizerOpMode opMode,
-                             DlQuantization::IAllocator* allocator, bool useCuda)
+                             const DlQuantization::TensorQuantizerOpMode opMode, DlQuantization::IAllocator* allocator,
+                             bool useCuda, void* stream)
 {
     switch (opMode)
     {
     case DlQuantization::TensorQuantizerOpMode::oneShotQuantizeDequantize:
     case DlQuantization::TensorQuantizerOpMode::quantizeDequantize:
     {
-        if(useCuda)
+        if (useCuda)
         {
-            DlQuantization::quantizeDequantizeFp16Gpu(inTensor, count, outTensor);
+            DlQuantization::quantizeDequantizeFp16Gpu(inTensor, count, outTensor, stream);
         }
         else
             quantizeDequantizeFp16Cpu(inTensor, count, outTensor);
@@ -236,7 +236,7 @@ void modeSpecificActionFloat(const T* inTensor, size_t count, T* outTensor,
     case DlQuantization::TensorQuantizerOpMode::updateStats:
     case DlQuantization::TensorQuantizerOpMode::passThrough:
     {
-        copyInputTensorsToOutputTensors(inTensor, count, outTensor, useCuda);
+        copyInputTensorsToOutputTensors(inTensor, count, outTensor, useCuda, stream);
         break;
     }
     default:
diff --git a/TrainingExtensions/onnx/src/QcQuantizeOp.cpp b/TrainingExtensions/onnx/src/QcQuantizeOp.cpp
index aba0df3c52b..1d893937e51 100644
--- a/TrainingExtensions/onnx/src/QcQuantizeOp.cpp
+++ b/TrainingExtensions/onnx/src/QcQuantizeOp.cpp
@@ -82,11 +82,12 @@ void QcQuantizeKernel::Compute(OrtKernelContext* context)
     api_.ReleaseTensorTypeAndShapeInfo(outputInfo);
 
     DlQuantization::IAllocator* allocator = &cpuAllocator;
+    void* stream = nullptr;
 #ifdef ONNX_CUDA
     if (useCuda)
     {
         allocator = &cudaAllocator;
-        cudaDeviceSynchronize();
+        stream = api_.KernelContext_GetGPUComputeStream(context);
     }
 #endif
@@ -97,17 +98,17 @@ void QcQuantizeKernel::Compute(OrtKernelContext* context)
             int axis = quantInfo->channelAxis;
             modeSpecificActionPerChannelInt(inputData, size, result, axis, dimensions, quantInfo->tensorQuantizerRef,
                                             opMode, encodings, quantInfo->useSymmetricEncoding, allocator, useCuda,
-                                            tensorQuantizationSim);
+                                            stream, tensorQuantizationSim);
         }
         else
         {
             modeSpecificActionInt(inputData, size, result, quantInfo->tensorQuantizerRef[0], opMode, encodings[0],
-                                  quantInfo->useSymmetricEncoding, allocator, useCuda);
+                                  quantInfo->useSymmetricEncoding, allocator, useCuda, stream);
         }
     }
     else
     {
-        modeSpecificActionFloat(inputData, size, result, opMode, allocator, useCuda);
+        modeSpecificActionFloat(inputData, size, result, opMode, allocator, useCuda, stream);
     }
 
     // We only ever need to run in oneShotQuantizeDequantize once, afterwards just use quantizeDequantize
@@ -115,14 +116,6 @@
     {
         quantInfo->opMode = DlQuantization::TensorQuantizerOpMode::quantizeDequantize;
     }
-
-#ifdef ONNX_CUDA
-    // Wait for our calls to finish before continuing since we do not use the onnxruntime stream
-    if (useCuda)
-    {
-        cudaDeviceSynchronize();
-    }
-#endif
 }
diff --git a/TrainingExtensions/onnx/src/QuantizeDequantizeUtils.hpp b/TrainingExtensions/onnx/src/QuantizeDequantizeUtils.hpp
index f21e202c480..76c5be03b2d 100644
--- a/TrainingExtensions/onnx/src/QuantizeDequantizeUtils.hpp
+++ b/TrainingExtensions/onnx/src/QuantizeDequantizeUtils.hpp
@@ -99,7 +99,7 @@ void quantizeDequantizePerChannel(
     const T* inTensor, std::vector& shape, int axis, T* outTensor, std::vector& encodings,
     std::vector& tensorQuantizers, bool useCuda,
-    DlQuantization::IAllocator* allocator,
+    DlQuantization::IAllocator* allocator, void* stream,
     std::unique_ptr >& tensorQuantizationSim)
 {
     size_t channels = shape[axis];
@@ -144,7 +144,7 @@ void quantizeDequantizePerChannel(
     tensorQuantizationSim->quantizeDequantizeTensorPerChannel(inTensor, channels, numElement, innerDims, outTensor,
                                                               encodingMin, encodingMax, encodingDelta, encodingOffset,
-                                                              tensorQuantizers[0]->roundingMode, useCuda);
+                                                              tensorQuantizers[0]->roundingMode, useCuda, stream);
     if (useCuda)
     {
         allocator->deleteRaw(encodingVectorDevice);
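
As a usage illustration (not part of the patch), the sketch below shows how a caller outside ONNX Runtime might drive one of the new stream-aware entry points. It is a minimal harness under stated assumptions: the include path presumes the DlQuantization include/ directory is on the compiler's include path, quantizeDequantizeFp16Gpu is used purely as a convenient example of the new void* stream parameter, and the library must have been built with GPU_QUANTIZATION_ENABLED.

// Hypothetical standalone harness -- illustrative only, not shipped with this change.
#include <cuda_runtime.h>
#include <vector>
#include "DlQuantization/Fp16Quantization.hpp"   // assumed include path

int main()
{
    const int n = 1 << 20;
    std::vector<float> hostIn(n, 0.1234f), hostOut(n);

    float* devIn  = nullptr;
    float* devOut = nullptr;
    cudaMalloc(&devIn, n * sizeof(float));
    cudaMalloc(&devOut, n * sizeof(float));

    // All work is enqueued on a single stream; no device-wide synchronization is needed.
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cudaMemcpyAsync(devIn, hostIn.data(), n * sizeof(float), cudaMemcpyHostToDevice, stream);

    // The stream is passed as an opaque void*; the library casts it back to a CUDA stream
    // before launching its kernels (see trim_functions.cu above).
    DlQuantization::quantizeDequantizeFp16Gpu(devIn, n, devOut, stream);

    cudaMemcpyAsync(hostOut.data(), devOut, n * sizeof(float), cudaMemcpyDeviceToHost, stream);

    // Synchronize only this stream rather than the whole device.
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    cudaFree(devIn);
    cudaFree(devOut);
    return 0;
}

Inside the ONNX Runtime custom op itself, the stream comes from api_.KernelContext_GetGPUComputeStream(context), so the quantize/dequantize kernels and the cudaMemcpyAsync copies are ordered on ORT's own compute stream together with the rest of the session's GPU work; that ordering is what makes it safe to drop the two cudaDeviceSynchronize() calls.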