Add cuda stream argument for onnx quantize-dequantize
Signed-off-by: Michael Tuttle <[email protected]>
quic-mtuttle authored Oct 10, 2023
1 parent 626913b commit 60fbf1f
Showing 17 changed files with 133 additions and 81 deletions.
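
This commit threads an opaque void* stream argument through the quantize-dequantize call chain so GPU kernels can run on a caller-supplied CUDA stream; every pre-existing signature forwards nullptr, which maps to the default stream and preserves the old behavior. As a minimal sketch of the idea (the kernel and launcher below are illustrative, not code from this commit), a GPU-side implementation would recover the typed stream from the void* at the launch site:

    #include <cuda_runtime.h>

    // Stand-in kernel; the real code applies the quantize-dequantize math per element.
    __global__ void qdqKernel(const float* in, int cnt, float* out)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < cnt)
            out[i] = in[i];
    }

    void launchQdq(const float* in, int cnt, float* out, void* stream)
    {
        // nullptr casts to the default (legacy) stream, matching the nullptr
        // forwarded by the unchanged public signatures.
        cudaStream_t cuStream = static_cast<cudaStream_t>(stream);
        int threads = 256;
        int blocks  = (cnt + threads - 1) / threads;
        qdqKernel<<<blocks, threads, 0, cuStream>>>(in, cnt, out);
    }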
@@ -49,7 +49,7 @@ namespace DlQuantization
* @param cnt total size of input tensor
* @param out pointer to the output tensor
*/
-void quantizeDequantizeFp16Gpu(const float* in, int cnt, float* out);
+void quantizeDequantizeFp16Gpu(const float* in, int cnt, float* out, void* stream = nullptr);

}

@@ -52,6 +52,13 @@ class ITensorQuantizationSim
double encodingMin, double encodingMax,
uint8_t bw, RoundingMode roundMode,
bool use_cuda) = 0;

+virtual void quantizeDequantizeTensor(const DTYPE* inputTensorData, size_t inputTensorCount,
+DTYPE* outputTensorData,
+double encodingMin, double encodingMax,
+uint8_t bw, RoundingMode roundMode,
+bool use_cuda, void* stream) = 0;

virtual void quantizeTensor(const DTYPE* inputTensorData, size_t inputTensorCount, DTYPE* outputTensorData,
double encodingMin, double encodingMax, uint8_t bw, RoundingMode roundMode,
bool use_cuda, bool shiftToSigned) = 0;
@@ -112,6 +119,12 @@ class ITensorQuantizationSim
DTYPE* encodingDelta, DTYPE* encodingOffset,
RoundingMode roundingMode, bool useCuda) = 0;

+virtual void quantizeDequantizeTensorPerChannel(const DTYPE* inputTensorData, size_t numChannel,
+size_t numElement, size_t numElementPerChannel,
+DTYPE* outputTensorData, DTYPE* encodingMin, DTYPE* encodingMax,
+DTYPE* encodingDelta, DTYPE* encodingOffset,
+RoundingMode roundingMode, bool useCuda, void* stream) = 0;

};

} // namespace DlQuantization
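
An aside on the API shape: the non-virtual free function quantizeDequantizeFp16Gpu takes the stream as a defaulted argument (void* stream = nullptr), while these virtual interface methods gain explicit overloads instead. One plausible reason is the classic C++ pitfall that default arguments on virtual functions bind to the caller's static type rather than to the override. A self-contained demonstration (illustrative only, unrelated to this repository):

    #include <iostream>

    struct Base
    {
        virtual ~Base() = default;
        virtual void f(int x = 1) { std::cout << "Base::f(" << x << ")\n"; }
    };

    struct Derived : Base
    {
        void f(int x = 2) override { std::cout << "Derived::f(" << x << ")\n"; }
    };

    int main()
    {
        Derived d;
        Base&   b = d;
        b.f();   // prints "Derived::f(1)": the override runs, but Base's default is used
        d.f();   // prints "Derived::f(2)"
    }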
@@ -118,6 +118,9 @@ class TensorQuantizer : public TensorQuantizerOpFacade
void quantizeDequantize(const float* input, std::size_t tensorSize, float* output,
double encodingMin, double encodingMax, unsigned int bitwidth, bool useCuda) override;

+void quantizeDequantize(const float* input, std::size_t tensorSize, float* output, double encodingMin,
+double encodingMax, unsigned int bitwidth, bool useCuda, void* stream) override;

/**
* @brief Convert a tensor from DTYPE to quantized 8-bit packed format
* @relates quantizeDequantize, except output is stored in 8-bit packed format
@@ -90,6 +90,8 @@ class TensorQuantizerOpFacade
virtual void quantizeDequantize(const float* input, std::size_t tensorSize, float* output,
double encodingMin, double encodingMax, unsigned int bitwidth, bool useCuda) = 0;

+virtual void quantizeDequantize(const float* input, std::size_t tensorSize, float* output, double encodingMin,
+double encodingMax, unsigned int bitwidth, bool useCuda, void* stream) = 0;
/**
* Compute the encoding for this tensor using stats collected so far
*/
4 changes: 2 additions & 2 deletions ModelOptimizations/DlQuantization/src/Fp16Quantization.cpp
@@ -43,10 +43,10 @@
namespace DlQuantization
{

-void quantizeDequantizeFp16Gpu(const float* in, int cnt, float* out)
+void quantizeDequantizeFp16Gpu(const float* in, int cnt, float* out, void* stream)
{
#ifdef GPU_QUANTIZATION_ENABLED
-quantizeDequantizeFp16ForGPU(in, cnt, out);
+quantizeDequantizeFp16ForGPU(in, cnt, out, stream);
#else
throw std::runtime_error("Not compiled for GPU mode.");
#endif
@@ -116,7 +116,7 @@ void MainQuantizationClass<DTYPE>::QuantizeDequantizeActs(const string& layer, L
for (unsigned int blob_id = 0; blob_id < acts.size(); ++blob_id)
{
quantizeDequantize(acts[blob_id], count[blob_id], encoding[blob_id], acts_quantized[blob_id], m_ModeCpuGpu,
-ROUND_NEAREST);
+ROUND_NEAREST, nullptr);
}
}

@@ -126,7 +126,7 @@ void MainQuantizationClass<DTYPE>::QuantizeDequantizeParams(int bw, DTYPE* param
TfEncoding& encoding)
{
m_QuantAlgo->NumberDistributionToFxpFormat(bw, params, count, encoding);
-quantizeDequantize(params, count, encoding, params_quantized, m_ModeCpuGpu, mode_rounding);
+quantizeDequantize(params, count, encoding, params_quantized, m_ModeCpuGpu, mode_rounding, nullptr);
}

template <typename DTYPE>
30 changes: 27 additions & 3 deletions ModelOptimizations/DlQuantization/src/TensorQuantizationSim.cpp
@@ -95,11 +95,21 @@ void TensorQuantizationSim<DTYPE>::quantizeDequantizeTensor(const DTYPE* inputTe
DTYPE* outputTensorData, double encodingMin,
double encodingMax, uint8_t bw, RoundingMode roundingMode,
bool use_cuda)
{
+quantizeDequantizeTensor(inputTensorData, inputTensorCount, outputTensorData, encodingMin, encodingMax,
+bw, roundingMode, use_cuda, nullptr);
+}
+
+template <typename DTYPE>
+void TensorQuantizationSim<DTYPE>::quantizeDequantizeTensor(const DTYPE* inputTensorData, size_t inputTensorCount,
+DTYPE* outputTensorData, double encodingMin,
+double encodingMax, uint8_t bw, RoundingMode roundingMode,
+bool use_cuda, void* stream)
+{
TfEncoding encoding;
fillEncodingInfo(encoding, bw, encodingMin, encodingMax);
quantizeDequantize(inputTensorData, inputTensorCount, encoding, outputTensorData, getComputationMode(use_cuda),
-roundingMode);
+roundingMode, stream);
}

template <typename DTYPE>
@@ -145,7 +155,7 @@ void TensorQuantizationSim<DTYPE>::quantizeDequantizePerChannelTensor(
for (uint32_t i = 0; i < splits.size(); ++i) {
auto& split = splits[i];
quantizeDequantize(split.data(), split.size(), completeEncodings[i], split.data(), getComputationMode(useCuda),
-roundMode);
+roundMode, nullptr);
}

// Concatenate the quantized data back into its original shape.
@@ -269,6 +279,19 @@ void TensorQuantizationSim<DTYPE>::quantizeDequantizeTensorPerChannel(const DTYP
DTYPE* encodingMax, DTYPE* encodingDelta,
DTYPE* encodingOffset, RoundingMode roundingMode,
bool useCuda)
{
+quantizeDequantizeTensorPerChannel(inputTensorData, numChannel, numElement, numElementPerChannel, outputTensorData,
+encodingMin, encodingMax, encodingDelta, encodingOffset, roundingMode, useCuda,
+nullptr);
+}
+
+template <typename DTYPE>
+void TensorQuantizationSim<DTYPE>::quantizeDequantizeTensorPerChannel(const DTYPE* inputTensorData, size_t numChannel,
+size_t numElement, size_t numElementPerChannel,
+DTYPE* outputTensorData, DTYPE* encodingMin,
+DTYPE* encodingMax, DTYPE* encodingDelta,
+DTYPE* encodingOffset, RoundingMode roundingMode,
+bool useCuda, void* stream)
+{
DlQuantization::ComputationMode cpuGpuMode;
if (useCuda)
@@ -277,7 +300,8 @@ void TensorQuantizationSim<DTYPE>::quantizeDequantizeTensorPerChannel(const DTYP
cpuGpuMode = DlQuantization::ComputationMode::COMP_MODE_CPU;

quantizeDequantizePerChannel(inputTensorData, numChannel, numElement, numElementPerChannel, outputTensorData,
-encodingMin, encodingMax, encodingDelta, encodingOffset, cpuGpuMode, roundingMode);
+encodingMin, encodingMax, encodingDelta, encodingOffset, cpuGpuMode, roundingMode,
+stream);
}

template class TensorQuantizationSim<float>;
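
For orientation, the per-channel path applies an independent delta/offset pair to each channel. A minimal CPU-side sketch of that math, assuming the usual TF-style convention (offset derived from the encoding minimum, values clamped to the integer grid for the bitwidth) and a numChannel x numElementPerChannel layout; the function name and parameters are illustrative, not the library's:

    #include <algorithm>
    #include <cmath>

    void qdqPerChannelCpuSketch(const float* in, int numChannel, int numElementPerChannel,
                                float* out, const float* delta, const float* offset,
                                float qMin, float qMax)
    {
        for (int c = 0; c < numChannel; ++c)
        {
            for (int i = 0; i < numElementPerChannel; ++i)
            {
                int   idx = c * numElementPerChannel + i;
                float q   = std::round(in[idx] / delta[c]) - offset[c];
                q         = std::min(std::max(q, qMin), qMax);  // clamp to the quantized grid
                out[idx]  = (q + offset[c]) * delta[c];         // dequantize back to float
            }
        }
    }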
9 changes: 9 additions & 0 deletions ModelOptimizations/DlQuantization/src/TensorQuantizationSim.h
@@ -60,6 +60,10 @@ class TensorQuantizationSim : public ITensorQuantizationSim<DTYPE>
double encodingMin, double encodingMax, uint8_t bw, RoundingMode roundMode,
bool use_cuda) override;

+void quantizeDequantizeTensor(const DTYPE* inputTensorData, size_t inputTensorCount, DTYPE* outputTensorData,
+double encodingMin, double encodingMax, uint8_t bw, RoundingMode roundMode,
+bool use_cuda, void* stream) override;

void quantizeTensor(const DTYPE* inputTensorData, size_t inputTensorCount, DTYPE* outputTensorData,
double encodingMin, double encodingMax, uint8_t bw, RoundingMode roundMode, bool use_cuda,
bool shiftToSigned)
@@ -99,6 +103,11 @@ class TensorQuantizationSim : public ITensorQuantizationSim<DTYPE>
DTYPE* encodingMax, DTYPE* encodingDelta, DTYPE* encodingOffset,
RoundingMode roundingMode, bool useCuda) override;

+void quantizeDequantizeTensorPerChannel(const DTYPE* inputTensorData, size_t numChannel, size_t numElement,
+size_t numElementPerChannel, DTYPE* outputTensorData, DTYPE* encodingMin,
+DTYPE* encodingMax, DTYPE* encodingDelta, DTYPE* encodingOffset,
+RoundingMode roundingMode, bool useCuda, void* stream) override;

inline DlQuantization::ComputationMode getComputationMode(bool use_cuda)
{
return (use_cuda ? DlQuantization::ComputationMode::COMP_MODE_GPU
@@ -75,7 +75,7 @@ py::array_t<float> TensorQuantizationSimForPython::quantizeDequantize(py::array_

_tensorQuantizationSim->quantizeDequantizeTensor(inputDataPtr, inputTensorSize, outputDataPtr,
encoding.min, encoding.max, bitwidth, roundingMode,
-use_cuda);
+use_cuda, nullptr);

return output;
}
8 changes: 7 additions & 1 deletion ModelOptimizations/DlQuantization/src/TensorQuantizer.cpp
@@ -164,10 +164,16 @@ void TensorQuantizer::computeEncodingFromData(uint8_t bw, const float* data, siz

void TensorQuantizer::quantizeDequantize(const float* input, std::size_t tensorSize, float* output, double encodingMin,
double encodingMax, unsigned int bitwidth, bool useCuda)
{
+quantizeDequantize(input, tensorSize, output, encodingMin, encodingMax, bitwidth, useCuda, nullptr);
+}
+
+void TensorQuantizer::quantizeDequantize(const float* input, std::size_t tensorSize, float* output, double encodingMin,
+double encodingMax, unsigned int bitwidth, bool useCuda, void* stream)
+{
assert(isEncodingValid);
_tensorQuantizationSim->quantizeDequantizeTensor(input, tensorSize, output, encodingMin, encodingMax, bitwidth,
-roundingMode, useCuda);
+roundingMode, useCuda, stream);
}

void TensorQuantizer::quantizeTensorPacked(const float* input, std::size_t tensorSize, std::vector<uint8_t>& output,
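
On the caller side, any CUDA stream can be passed through the new overload after erasing it to void*. A hypothetical usage sketch, assuming a CUDA-enabled build, device pointers, and a TensorQuantizer whose encoding is already valid (the encoding values below are placeholders, and the library's TensorQuantizer header is assumed to be included):

    #include <cstddef>
    #include <cuda_runtime.h>

    void runOnStream(DlQuantization::TensorQuantizer& quantizer,
                     const float* devIn, std::size_t n, float* devOut)
    {
        cudaStream_t stream;
        cudaStreamCreate(&stream);

        // Launch asynchronously on our stream instead of the default stream.
        quantizer.quantizeDequantize(devIn, n, devOut,
                                     /*encodingMin=*/-1.0, /*encodingMax=*/1.0,
                                     /*bitwidth=*/8, /*useCuda=*/true,
                                     static_cast<void*>(stream));

        cudaStreamSynchronize(stream);  // results are valid only after the stream drains
        cudaStreamDestroy(stream);
    }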
16 changes: 8 additions & 8 deletions ModelOptimizations/DlQuantization/src/trim_functions.cpp
@@ -92,7 +92,7 @@ Lambda parallelize(const uint32_t number_of_threads, Lambda lambda)
// encoding: TF: rounded
template <typename DTYPE>
void quantizeDequantize(const DTYPE* in, int cnt, const TfEncoding& encoding, DTYPE* out,
-ComputationMode mode_cpu_gpu, RoundingMode rounding_mode)
+ComputationMode mode_cpu_gpu, RoundingMode rounding_mode, void* stream)
{
switch (mode_cpu_gpu)
{
@@ -101,7 +101,7 @@ void quantizeDequantize(const DTYPE* in, int cnt, const TfEncoding& encoding, DT
break;
case COMP_MODE_GPU:
#ifdef GPU_QUANTIZATION_ENABLED
-quantizeDequantizeGpu(in, cnt, encoding, out, rounding_mode);
+quantizeDequantizeGpu(in, cnt, encoding, out, rounding_mode, stream);
#else
throw runtime_error("Not compiled for GPU mode.");
#endif
@@ -602,7 +602,7 @@ void dequantizeFromPackedFxpCpu(const uint8_t* input, int cnt,
template <typename DTYPE>
void quantizeDequantizePerChannel(const DTYPE* in, int numChannel, int numElement, int numElementPerChannel, DTYPE* out,
DTYPE* encodingMin, DTYPE* encodingMax, DTYPE* encodingDelta, DTYPE* encodingOffset,
-ComputationMode modeCpuGpu, RoundingMode roundingMode)
+ComputationMode modeCpuGpu, RoundingMode roundingMode, void* stream)
{
switch (modeCpuGpu)
{
@@ -613,7 +613,7 @@ void quantizeDequantizePerChannel(const DTYPE* in, int numChannel, int numElemen
case COMP_MODE_GPU:
#ifdef GPU_QUANTIZATION_ENABLED
quantizeDequantizePerChannelGpu(in, numChannel, numElement, numElementPerChannel, out, encodingMin, encodingMax,
-encodingDelta, encodingOffset, roundingMode);
+encodingDelta, encodingOffset, roundingMode, stream);
#else
throw runtime_error("Not compiled for GPU mode.");
#endif
@@ -643,10 +643,10 @@ void quantizeDequantizePerChannelCpu(const DTYPE* in, int numChannel, int numEle

// Explicit instantiations
template void quantizeDequantize(const double* in, int cnt, const TfEncoding& encoding, double* out,
-ComputationMode mode_cpu_gpu, RoundingMode rounding_mode);
+ComputationMode mode_cpu_gpu, RoundingMode rounding_mode, void* stream);

template void quantizeDequantize(const float* in, int cnt, const TfEncoding& encoding, float* out,
-ComputationMode mode_cpu_gpu, RoundingMode rounding_mode);
+ComputationMode mode_cpu_gpu, RoundingMode rounding_mode, void* stream);

template void quantizeToFxp(const double* in, int cnt, const TfEncoding& encoding, double* out,
ComputationMode mode_cpu_gpu, RoundingMode rounding_mode, bool shiftToSigned);
@@ -669,9 +669,9 @@ template void dequantizeFromPackedFxp(const uint8_t* input, int cnt,

template void quantizeDequantizePerChannel(const float* in, int numChannel, int numElement, int numElementPerChannel, float* out,
float* encodingMin, float* encodingMax, float* encodingDelta, float* encodingOffset,
-ComputationMode modeCpuGpu, RoundingMode roundingMode);
+ComputationMode modeCpuGpu, RoundingMode roundingMode, void* stream);
template void quantizeDequantizePerChannel(const double* in, int numChannel, int numElement, int numElementPerChannel, double* out,
double* encodingMin, double* encodingMax, double* encodingDelta, double* encodingOffset,
-ComputationMode modeCpuGpu, RoundingMode roundingMode);
+ComputationMode modeCpuGpu, RoundingMode roundingMode, void* stream);

} // End of namespace DlQuantization
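
Because explicit instantiations name the complete signature, each of the instantiation lines above must also spell out the new void* stream parameter. The pattern itself, in a generic form (names illustrative): the template is defined once in a .cpp file and instantiated for the supported element types, so headers need only declarations.

    // definition lives in one translation unit
    template <typename T>
    void scaleBy(const T* in, int cnt, T* out, T k)
    {
        for (int i = 0; i < cnt; ++i)
            out[i] = in[i] * k;
    }

    // force code generation for the two supported element types; other
    // translation units link against these symbols without seeing the definition
    template void scaleBy<float>(const float*, int, float*, float);
    template void scaleBy<double>(const double*, int, double*, double);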