[CUDA] Multi-GPU for CUDA Version #6138

Open: shiyu1994 wants to merge 70 commits into master from nccl-dev

Commits (70)
ee3923b  initialize nccl (shiyu1994, Oct 10, 2023)
82668d0  Merge branch 'master' into nccl-dev (shiyu1994, Oct 26, 2023)
6189cbb  Merge branch 'master' into nccl-dev (shiyu1994, Oct 27, 2023)
f39f877  change year in header (shiyu1994, Nov 8, 2023)
e513662  Merge branch 'master' into nccl-dev (shiyu1994, Nov 8, 2023)
47f3e50  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Nov 8, 2023)
985780f  add implementation of nccl gbdt (shiyu1994, Nov 8, 2023)
35b0ca1  add nccl topology (shiyu1994, Nov 9, 2023)
7d36a14  clean up (shiyu1994, Nov 9, 2023)
5470d99  Merge branch 'master' into nccl-dev (shiyu1994, Nov 9, 2023)
7b47a1e  clean up (shiyu1994, Nov 9, 2023)
839c375  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Nov 9, 2023)
8eaf3ad  Merge branch 'master' into nccl-dev (shiyu1994, Dec 15, 2023)
cc72fc8  Merge branch 'master' into nccl-dev (shiyu1994, Dec 22, 2023)
209e25d  set nccl info (shiyu1994, Jan 25, 2024)
431f967  support quantized training with categorical features on cpu (shiyu1994, Feb 5, 2024)
b07caf2  remove white spaces (shiyu1994, Feb 5, 2024)
cf60467  add tests for quantized training with categorical features (shiyu1994, Feb 5, 2024)
bf2f649  skip tests for cuda version (shiyu1994, Feb 5, 2024)
2fc9525  fix cases when only 1 data block in row-wise quantized histogram cons… (shiyu1994, Feb 6, 2024)
dce770c  remove useless capture (shiyu1994, Feb 6, 2024)
f0c44fc  Merge branch 'master' into nccl-dev (shiyu1994, Feb 6, 2024)
e2cb41f  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Feb 6, 2024)
f3985ef  fix inconsistency of gpu devices (shiyu1994, Feb 7, 2024)
d000a41  fix creating boosting object from file (shiyu1994, Feb 7, 2024)
ecdccd5  change num_gpu to num_gpus in test case (shiyu1994, Feb 7, 2024)
dfa4419  fix objective initialization (shiyu1994, Feb 9, 2024)
f4b8906  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Feb 9, 2024)
f0b22d1  fix c++ compilation warning (shiyu1994, Feb 9, 2024)
617b3b2  fix lint errors (shiyu1994, Feb 9, 2024)
6d090b2  Merge branch 'master' into fix-6257 (shiyu1994, Feb 20, 2024)
736ab8a  Merge branch 'master' into nccl-dev (shiyu1994, Feb 20, 2024)
ad72d9f  Merge branch 'fix-6257' into nccl-dev (shiyu1994, Feb 20, 2024)
2670f48  fix compilation warnings (shiyu1994, Feb 20, 2024)
02b725b  change num_gpu to num_gpus in R test case (shiyu1994, Feb 20, 2024)
3bfb784  add nccl synchronization in tree training (shiyu1994, Feb 20, 2024)
fe1f592  fix global num data update (shiyu1994, Feb 21, 2024)
a528bd6  merge master (shiyu1994, Feb 22, 2024)
996d70b  fix ruff-format issues (shiyu1994, Feb 22, 2024)
671bed3  merge master (shiyu1994, Feb 23, 2024)
34610fb  use global num data in split finder (shiyu1994, Feb 23, 2024)
041018b  Merge branch 'master' into nccl-dev (shiyu1994, Mar 6, 2024)
e1b4512  explicit initialization of NCCLInfo members (shiyu1994, Mar 11, 2024)
0a21b5f  Merge branch 'master' into nccl-dev (shiyu1994, Mar 25, 2024)
be29624  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Mar 25, 2024)
06cfde4  Merge branch 'master' into nccl-dev (shiyu1994, Apr 11, 2024)
75afe5e  Merge branch 'master' into nccl-dev (shiyu1994, May 20, 2024)
1e6e4a1  Merge branch 'master' into nccl-dev (shiyu1994, Jun 30, 2024)
614605c  merge master (shiyu1994, Oct 8, 2024)
18babb0  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Oct 8, 2024)
11f4062  fix compilation (shiyu1994, Oct 8, 2024)
b4c21c2  use CUDAVector (shiyu1994, Oct 9, 2024)
70fe10f  use CUDAVector (shiyu1994, Oct 9, 2024)
849a554  merge master (shiyu1994, Oct 18, 2024)
19a2662  merge master (shiyu1994, Oct 18, 2024)
6db879a  use CUDAVector (shiyu1994, Oct 25, 2024)
b43f88b  use CUDAVector for cuda tree and column data (shiyu1994, Oct 25, 2024)
582c760  update gbdt (shiyu1994, Oct 25, 2024)
b9e143b  changes for cuda tree (shiyu1994, Oct 25, 2024)
483e521  use CUDAVector for cuda column data (shiyu1994, Oct 25, 2024)
950199d  fix bug in GetDataByColumnPointers (shiyu1994, Oct 25, 2024)
f30ee85  Merge branch 'master' into nccl-dev (shiyu1994, Oct 25, 2024)
d11991a  disable cuda by default (shiyu1994, Oct 25, 2024)
4bb4411  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Oct 25, 2024)
b56b39e  fix single machine gbdt (shiyu1994, Oct 25, 2024)
3bebc19  merge main (shiyu1994, Dec 17, 2024)
47b4364  clean up (shiyu1994, Dec 17, 2024)
a326c87  fix typo (shiyu1994, Dec 17, 2024)
d8ea043  Merge branch 'master' into nccl-dev (shiyu1994, Dec 18, 2024)
2f040b7  Merge branch 'master' into nccl-dev (shiyu1994, Dec 23, 2024)
2 changes: 1 addition & 1 deletion .ci/check-python-dists.sh
@@ -32,7 +32,7 @@ if [ "$PY_MINOR_VER" -gt 7 ]; then
     --inspect \
     --ignore 'compiled-objects-have-debug-symbols'\
     --ignore 'distro-too-large-compressed' \
-    --max-allowed-size-uncompressed '100M' \
+    --max-allowed-size-uncompressed '500M' \
     --max-allowed-files 800 \
     "$(echo "${DIST_DIR}"/*)" || exit 1
 elif { test "$(uname -m)" = "aarch64"; }; then
6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -218,6 +218,7 @@ endif()

 if(USE_CUDA)
   find_package(CUDAToolkit 11.0 REQUIRED)
+  find_package(Nccl REQUIRED)
   include_directories(${CUDAToolkit_INCLUDE_DIRS})
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall")

@@ -451,6 +452,7 @@ set(
   LGBM_CUDA_SOURCES
   src/boosting/cuda/cuda_score_updater.cpp
   src/boosting/cuda/cuda_score_updater.cu
+  src/boosting/cuda/nccl_gbdt.cpp
   src/metric/cuda/cuda_binary_metric.cpp
   src/metric/cuda/cuda_pointwise_metric.cpp
   src/metric/cuda/cuda_regression_metric.cpp
@@ -601,6 +603,10 @@ if(USE_GPU)
   target_link_libraries(lightgbm_objs PUBLIC ${OpenCL_LIBRARY} ${Boost_LIBRARIES})
 endif()

+if(USE_CUDA)
+  target_link_libraries(lightgbm_objs PUBLIC ${NCCL_LIBRARY})
+endif(USE_CUDA)
+
 if(__INTEGRATE_OPENCL)
   # targets OpenCL and Boost are added in IntegratedOpenCL.cmake
   add_dependencies(lightgbm_objs OpenCL Boost)
2 changes: 1 addition & 1 deletion R-package/tests/testthat/test_lgb.Booster.R
@@ -1107,7 +1107,7 @@ test_that("all parameters are stored correctly with save_model_to_string()", {
     , "[gpu_platform_id: -1]"
     , "[gpu_device_id: -1]"
     , "[gpu_use_dp: 0]"
-    , "[num_gpu: 1]"
+    , "[num_gpus: 1]"
   )
   all_param_entries <- c(non_default_param_entries, default_param_entries)
70 changes: 70 additions & 0 deletions cmake/modules/FindNccl.cmake
@@ -0,0 +1,70 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tries to find NCCL headers and libraries.
#
# Usage of this module as follows:
#
# find_package(NCCL)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# NCCL_ROOT - When set, this path is inspected instead of standard library
# locations as the root of the NCCL installation.
# The environment variable NCCL_ROOT overrides this variable.
#
# This module defines
# Nccl_FOUND, whether nccl has been found
# NCCL_INCLUDE_DIR, directory containing header
# NCCL_LIBRARY, directory containing nccl library
# NCCL_LIB_NAME, nccl library name
# USE_NCCL_LIB_PATH, when set, NCCL_LIBRARY path is also inspected for the
# location of the nccl library. This would disable
# switching between static and shared.
#
# This module assumes that the user has already called find_package(CUDA)

if (NCCL_LIBRARY)
if(NOT USE_NCCL_LIB_PATH)
# Don't cache NCCL_LIBRARY to enable switching between static and shared.
unset(NCCL_LIBRARY CACHE)
endif(NOT USE_NCCL_LIB_PATH)
endif()

if (BUILD_WITH_SHARED_NCCL)
# libnccl.so
set(NCCL_LIB_NAME nccl)
else ()
# libnccl_static.a
set(NCCL_LIB_NAME nccl_static)
endif (BUILD_WITH_SHARED_NCCL)

find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include)

find_library(NCCL_LIBRARY
NAMES ${NCCL_LIB_NAME}
PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib)

message(STATUS "Using nccl library: ${NCCL_LIBRARY}")

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Nccl DEFAULT_MSG
NCCL_INCLUDE_DIR NCCL_LIBRARY)

mark_as_advanced(
NCCL_INCLUDE_DIR
NCCL_LIBRARY
)
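For context on what the new find-module enables: NCCL supplies the collective operations (all-reduce, all-gather) by which per-GPU gradients and histograms can be combined during training. Below is a minimal, self-contained sketch of a single-process all-reduce across all visible GPUs. It is illustrative only and not code from this PR; the buffer size and float data type are placeholders.

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

int main() {
  int num_gpus = 0;
  cudaGetDeviceCount(&num_gpus);

  // One communicator per visible device (a null devlist selects devices 0..n-1).
  std::vector<ncclComm_t> comms(num_gpus);
  ncclCommInitAll(comms.data(), num_gpus, nullptr);

  const size_t count = 1024;  // placeholder size, e.g. one histogram buffer
  std::vector<float*> bufs(num_gpus);
  std::vector<cudaStream_t> streams(num_gpus);
  for (int i = 0; i < num_gpus; ++i) {
    cudaSetDevice(i);
    cudaMalloc(reinterpret_cast<void**>(&bufs[i]), count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // Sum each device's buffer across all GPUs, in place.
  ncclGroupStart();
  for (int i = 0; i < num_gpus; ++i) {
    ncclAllReduce(bufs[i], bufs[i], count, ncclFloat, ncclSum, comms[i], streams[i]);
  }
  ncclGroupEnd();

  for (int i = 0; i < num_gpus; ++i) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
    cudaStreamDestroy(streams[i]);
    cudaFree(bufs[i]);
    ncclCommDestroy(comms[i]);
  }
  return 0;
}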
12 changes: 12 additions & 0 deletions docs/Parameters.rst
@@ -1371,6 +1371,18 @@ GPU Parameters

 - **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details

+- ``num_gpus`` :raw-html:`<a id="num_gpus" title="Permalink to this parameter" href="#num_gpus">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int
+
+   - Number of GPUs to use for training, used with device_type=cuda
+
+   - When <= 0, only 1 GPU will be used
+
+- ``gpu_device_id_list`` :raw-html:`<a id="gpu_device_id_list" title="Permalink to this parameter" href="#gpu_device_id_list">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string
+
+   - List of CUDA device IDs used when device_type=cuda
+
+   - When empty, the devices with the smallest IDs will be used
+
 - ``gpu_use_dp`` :raw-html:`<a id="gpu_use_dp" title="Permalink to this parameter" href="#gpu_use_dp">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

   - set this to ``true`` to use double precision math on GPU (by default single precision is used)
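A hedged usage sketch of the two new parameters through LightGBM's C API. The parameter-string format and the C API calls are standard; "train.svm" is a placeholder file name, and the example assumes a build with USE_CUDA plus this PR's changes.

#include <LightGBM/c_api.h>

int main() {
  const char* params =
      "objective=binary device_type=cuda num_gpus=2 gpu_device_id_list=0,1";

  // Load training data and create a booster with the multi-GPU parameters.
  DatasetHandle train_data = nullptr;
  LGBM_DatasetCreateFromFile("train.svm", params, nullptr, &train_data);

  BoosterHandle booster = nullptr;
  LGBM_BoosterCreate(train_data, params, &booster);

  int is_finished = 0;
  for (int iter = 0; iter < 100 && !is_finished; ++iter) {
    LGBM_BoosterUpdateOneIter(booster, &is_finished);
  }

  LGBM_BoosterFree(booster);
  LGBM_DatasetFree(train_data);
  return 0;
}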
4 changes: 3 additions & 1 deletion include/LightGBM/boosting.h
@@ -309,9 +309,11 @@ class LIGHTGBM_EXPORT Boosting {
    * \param format Format of model
    * \param config config for boosting
    * \param filename name of model file, if existing will continue to train from this model
+   * \param device_type type of device, can be cpu, gpu or cuda
+   * \param num_gpu number of GPUs to use
    * \return The boosting object
    */
-  static Boosting* CreateBoosting(const std::string& type, const char* filename);
+  static Boosting* CreateBoosting(const std::string& type, const char* filename, const std::string& device_type, const int num_gpu);

   virtual std::string GetLoadedParam() const = 0;
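For illustration, a call against the widened factory signature might look like the following sketch; "gbdt" is LightGBM's standard boosting type, a null filename means no existing model to resume from, and the device count is a placeholder.

#include <LightGBM/boosting.h>

// Sketch: request a GBDT boosting object that trains on 2 CUDA GPUs.
LightGBM::Boosting* MakeMultiGPUBooster() {
  return LightGBM::Boosting::CreateBoosting("gbdt", nullptr, "cuda", 2);
}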
8 changes: 8 additions & 0 deletions include/LightGBM/config.h
@@ -1125,6 +1125,14 @@ struct Config {
   // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
   int gpu_device_id = -1;

+  // desc = Number of GPUs to use for training, used with device_type=cuda
+  // desc = When <= 0, only 1 GPU will be used
+  int num_gpus = 1;
+
+  // desc = List of CUDA device IDs used when device_type=cuda
+  // desc = When empty, the devices with the smallest IDs will be used
+  std::string gpu_device_id_list = "";
+
   // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used)
   // desc = **Note**: can be used only in OpenCL implementation (``device_type="gpu"``), in CUDA implementation only double precision is currently supported
   bool gpu_use_dp = false;
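To make the semantics of the two new fields concrete, here is an illustrative parser for gpu_device_id_list that also applies the documented defaults. The helper name ParseDeviceIDList is hypothetical; this is not the PR's actual parsing code.

#include <sstream>
#include <string>
#include <vector>

// Hypothetical helper: turn "0,2,3" into {0, 2, 3}. Falls back to the
// documented defaults: num_gpus <= 0 means 1 GPU, and an empty list means
// the devices with the smallest IDs are used.
std::vector<int> ParseDeviceIDList(const std::string& gpu_device_id_list, int num_gpus) {
  std::vector<int> device_ids;
  std::stringstream ss(gpu_device_id_list);
  std::string token;
  while (std::getline(ss, token, ',')) {
    if (!token.empty()) {
      device_ids.push_back(std::stoi(token));
    }
  }
  if (device_ids.empty()) {
    const int n = num_gpus <= 0 ? 1 : num_gpus;
    for (int i = 0; i < n; ++i) {
      device_ids.push_back(i);
    }
  }
  return device_ids;
}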
67 changes: 38 additions & 29 deletions include/LightGBM/cuda/cuda_column_data.hpp
@@ -13,6 +13,7 @@
 #include <LightGBM/bin.h>
 #include <LightGBM/utils/openmp_wrapper.h>

+#include <memory>
 #include <cstdint>
 #include <vector>

@@ -39,11 +40,11 @@ class CUDAColumnData {
     const std::vector<uint8_t>& feature_mfb_is_na,
     const std::vector<int>& feature_to_column);

-  const void* GetColumnData(const int column_index) const { return data_by_column_[column_index]; }
+  const void* GetColumnData(const int column_index) const { return data_by_column_[column_index]->RawData(); }

   void CopySubrow(const CUDAColumnData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices);

-  void* const* cuda_data_by_column() const { return cuda_data_by_column_; }
+  void* const* cuda_data_by_column() const { return cuda_data_by_column_.RawData(); }

   uint32_t feature_min_bin(const int feature_index) const { return feature_min_bin_[feature_index]; }

@@ -63,42 +64,50 @@ class CUDAColumnData {

   uint8_t feature_mfb_is_na(const int feature_index) const { return feature_mfb_is_na_[feature_index]; }

-  const uint32_t* cuda_feature_min_bin() const { return cuda_feature_min_bin_; }
+  const uint32_t* cuda_feature_min_bin() const { return cuda_feature_min_bin_.RawData(); }

-  const uint32_t* cuda_feature_max_bin() const { return cuda_feature_max_bin_; }
+  const uint32_t* cuda_feature_max_bin() const { return cuda_feature_max_bin_.RawData(); }

-  const uint32_t* cuda_feature_offset() const { return cuda_feature_offset_; }
+  const uint32_t* cuda_feature_offset() const { return cuda_feature_offset_.RawData(); }

-  const uint32_t* cuda_feature_most_freq_bin() const { return cuda_feature_most_freq_bin_; }
+  const uint32_t* cuda_feature_most_freq_bin() const { return cuda_feature_most_freq_bin_.RawData(); }

-  const uint32_t* cuda_feature_default_bin() const { return cuda_feature_default_bin_; }
+  const uint32_t* cuda_feature_default_bin() const { return cuda_feature_default_bin_.RawData(); }

-  const uint8_t* cuda_feature_missing_is_zero() const { return cuda_feature_missing_is_zero_; }
+  const uint8_t* cuda_feature_missing_is_zero() const { return cuda_feature_missing_is_zero_.RawData(); }

-  const uint8_t* cuda_feature_missing_is_na() const { return cuda_feature_missing_is_na_; }
+  const uint8_t* cuda_feature_missing_is_na() const { return cuda_feature_missing_is_na_.RawData(); }

-  const uint8_t* cuda_feature_mfb_is_zero() const { return cuda_feature_mfb_is_zero_; }
+  const uint8_t* cuda_feature_mfb_is_zero() const { return cuda_feature_mfb_is_zero_.RawData(); }

-  const uint8_t* cuda_feature_mfb_is_na() const { return cuda_feature_mfb_is_na_; }
+  const uint8_t* cuda_feature_mfb_is_na() const { return cuda_feature_mfb_is_na_.RawData(); }

-  const int* cuda_feature_to_column() const { return cuda_feature_to_column_; }
+  const int* cuda_feature_to_column() const { return cuda_feature_to_column_.RawData(); }

-  const uint8_t* cuda_column_bit_type() const { return cuda_column_bit_type_; }
+  const uint8_t* cuda_column_bit_type() const { return cuda_column_bit_type_.RawData(); }

   int feature_to_column(const int feature_index) const { return feature_to_column_[feature_index]; }

   uint8_t column_bit_type(const int column_index) const { return column_bit_type_[column_index]; }

  private:
   template <bool IS_SPARSE, bool IS_4BIT, typename BIN_TYPE>
-  void InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, void** out_column_data_pointer);
+  void InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, CUDAVector<void>* out_column_data_pointer);

   void LaunchCopySubrowKernel(void* const* in_cuda_data_by_column);

   void InitColumnMetaInfo();

   void ResizeWhenCopySubrow(const data_size_t num_used_indices);

+  std::vector<void*> GetDataByColumnPointers(const std::vector<std::unique_ptr<CUDAVector<void>>>& data_by_column) const {
+    std::vector<void*> data_by_column_pointers(data_by_column.size(), nullptr);
+    for (size_t i = 0; i < data_by_column.size(); ++i) {
+      data_by_column_pointers[i] = reinterpret_cast<void*>(data_by_column[i]->RawData());
+    }
+    return data_by_column_pointers;
+  }
+
   int gpu_device_id_;
   int num_threads_;
   data_size_t num_data_;
@@ -113,24 +122,24 @@ class CUDAColumnData {
   std::vector<uint8_t> feature_missing_is_na_;
   std::vector<uint8_t> feature_mfb_is_zero_;
   std::vector<uint8_t> feature_mfb_is_na_;
-  void** cuda_data_by_column_;
+  CUDAVector<void*> cuda_data_by_column_;
   std::vector<int> feature_to_column_;
-  std::vector<void*> data_by_column_;
-
-  uint8_t* cuda_column_bit_type_;
-  uint32_t* cuda_feature_min_bin_;
-  uint32_t* cuda_feature_max_bin_;
-  uint32_t* cuda_feature_offset_;
-  uint32_t* cuda_feature_most_freq_bin_;
-  uint32_t* cuda_feature_default_bin_;
-  uint8_t* cuda_feature_missing_is_zero_;
-  uint8_t* cuda_feature_missing_is_na_;
-  uint8_t* cuda_feature_mfb_is_zero_;
-  uint8_t* cuda_feature_mfb_is_na_;
-  int* cuda_feature_to_column_;
+  std::vector<std::unique_ptr<CUDAVector<void>>> data_by_column_;
+
+  CUDAVector<uint8_t> cuda_column_bit_type_;
+  CUDAVector<uint32_t> cuda_feature_min_bin_;
+  CUDAVector<uint32_t> cuda_feature_max_bin_;
+  CUDAVector<uint32_t> cuda_feature_offset_;
+  CUDAVector<uint32_t> cuda_feature_most_freq_bin_;
+  CUDAVector<uint32_t> cuda_feature_default_bin_;
+  CUDAVector<uint8_t> cuda_feature_missing_is_zero_;
+  CUDAVector<uint8_t> cuda_feature_missing_is_na_;
+  CUDAVector<uint8_t> cuda_feature_mfb_is_zero_;
+  CUDAVector<uint8_t> cuda_feature_mfb_is_na_;
+  CUDAVector<int> cuda_feature_to_column_;

   // used when bagging with subset
-  data_size_t* cuda_used_indices_;
+  CUDAVector<data_size_t> cuda_used_indices_;
   data_size_t num_used_indices_;
   data_size_t cur_subset_buffer_size_;
 };
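The recurring change in this header (and the next one) replaces manually managed device pointers with CUDAVector members, so device memory is released by the destructor rather than by hand-written frees. Below is a minimal sketch of what such an owning wrapper looks like, assuming only the plain CUDA runtime API; LightGBM's actual CUDAVector lives elsewhere in the codebase and is more featureful (its CUDAVector<void> use, for instance, implies a specialization not shown here).

#include <cuda_runtime.h>
#include <cstddef>

// Minimal RAII wrapper over a device allocation (illustrative only).
template <typename T>
class CUDAVector {
 public:
  CUDAVector() = default;
  ~CUDAVector() { Clear(); }
  // One owner per allocation: forbid copies.
  CUDAVector(const CUDAVector&) = delete;
  CUDAVector& operator=(const CUDAVector&) = delete;

  void Resize(size_t size) {
    Clear();
    if (size > 0) {
      cudaMalloc(reinterpret_cast<void**>(&data_), size * sizeof(T));
    }
    size_ = size;
  }

  void Clear() {
    if (data_ != nullptr) {
      cudaFree(data_);
      data_ = nullptr;
    }
    size_ = 0;
  }

  T* RawData() const { return data_; }
  size_t Size() const { return size_; }

 private:
  T* data_ = nullptr;
  size_t size_ = 0;
};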
18 changes: 9 additions & 9 deletions include/LightGBM/cuda/cuda_metadata.hpp
@@ -35,20 +35,20 @@ class CUDAMetadata {

   void SetInitScore(const double* init_score, data_size_t len);

-  const label_t* cuda_label() const { return cuda_label_; }
+  const label_t* cuda_label() const { return cuda_label_.RawData(); }

-  const label_t* cuda_weights() const { return cuda_weights_; }
+  const label_t* cuda_weights() const { return cuda_weights_.RawData(); }

-  const data_size_t* cuda_query_boundaries() const { return cuda_query_boundaries_; }
+  const data_size_t* cuda_query_boundaries() const { return cuda_query_boundaries_.RawData(); }

-  const label_t* cuda_query_weights() const { return cuda_query_weights_; }
+  const label_t* cuda_query_weights() const { return cuda_query_weights_.RawData(); }

  private:
-  label_t* cuda_label_;
-  label_t* cuda_weights_;
-  data_size_t* cuda_query_boundaries_;
-  label_t* cuda_query_weights_;
-  double* cuda_init_score_;
+  CUDAVector<label_t> cuda_label_;
+  CUDAVector<label_t> cuda_weights_;
+  CUDAVector<data_size_t> cuda_query_boundaries_;
+  CUDAVector<label_t> cuda_query_weights_;
+  CUDAVector<double> cuda_init_score_;
 };

 } // namespace LightGBM