
[CUDA] Multi-GPU for CUDA Version #6138

Status: Open — shiyu1994 wants to merge 70 commits into base: master from nccl-dev

Commits (70 total; the diff below shows changes from 41 commits)
ee3923b
initialize nccl
shiyu1994 Oct 10, 2023
82668d0
Merge branch 'master' into nccl-dev
shiyu1994 Oct 26, 2023
6189cbb
Merge branch 'master' into nccl-dev
shiyu1994 Oct 27, 2023
f39f877
change year in header
shiyu1994 Nov 8, 2023
e513662
Merge branch 'master' into nccl-dev
shiyu1994 Nov 8, 2023
47f3e50
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Nov 8, 2023
985780f
add implementation of nccl gbdt
shiyu1994 Nov 8, 2023
35b0ca1
add nccl topology
shiyu1994 Nov 9, 2023
7d36a14
clean up
shiyu1994 Nov 9, 2023
5470d99
Merge branch 'master' into nccl-dev
shiyu1994 Nov 9, 2023
7b47a1e
clean up
shiyu1994 Nov 9, 2023
839c375
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Nov 9, 2023
8eaf3ad
Merge branch 'master' into nccl-dev
shiyu1994 Dec 15, 2023
cc72fc8
Merge branch 'master' into nccl-dev
shiyu1994 Dec 22, 2023
209e25d
set nccl info
shiyu1994 Jan 25, 2024
431f967
support quantized training with categorical features on cpu
shiyu1994 Feb 5, 2024
b07caf2
remove white spaces
shiyu1994 Feb 5, 2024
cf60467
add tests for quantized training with categorical features
shiyu1994 Feb 5, 2024
bf2f649
skip tests for cuda version
shiyu1994 Feb 5, 2024
2fc9525
fix cases when only 1 data block in row-wise quantized histogram cons…
shiyu1994 Feb 6, 2024
dce770c
remove useless capture
shiyu1994 Feb 6, 2024
f0c44fc
Merge branch 'master' into nccl-dev
shiyu1994 Feb 6, 2024
e2cb41f
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Feb 6, 2024
f3985ef
fix inconsistency of gpu devices
shiyu1994 Feb 7, 2024
d000a41
fix creating boosting object from file
shiyu1994 Feb 7, 2024
ecdccd5
change num_gpu to num_gpus in test case
shiyu1994 Feb 7, 2024
dfa4419
fix objective initialization
shiyu1994 Feb 9, 2024
f4b8906
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Feb 9, 2024
f0b22d1
fix c++ compilation warning
shiyu1994 Feb 9, 2024
617b3b2
fix lint errors
shiyu1994 Feb 9, 2024
6d090b2
Merge branch 'master' into fix-6257
shiyu1994 Feb 20, 2024
736ab8a
Merge branch 'master' into nccl-dev
shiyu1994 Feb 20, 2024
ad72d9f
Merge branch 'fix-6257' into nccl-dev
shiyu1994 Feb 20, 2024
2670f48
fix compilation warnings
shiyu1994 Feb 20, 2024
02b725b
change num_gpu to num_gpus in R test case
shiyu1994 Feb 20, 2024
3bfb784
add nccl synchronization in tree training
shiyu1994 Feb 20, 2024
fe1f592
fix global num data update
shiyu1994 Feb 21, 2024
a528bd6
merge master
shiyu1994 Feb 22, 2024
996d70b
fix ruff-format issues
shiyu1994 Feb 22, 2024
671bed3
merge master
shiyu1994 Feb 23, 2024
34610fb
use global num data in split finder
shiyu1994 Feb 23, 2024
041018b
Merge branch 'master' into nccl-dev
shiyu1994 Mar 6, 2024
e1b4512
explicit initialization of NCCLInfo members
shiyu1994 Mar 11, 2024
0a21b5f
Merge branch 'master' into nccl-dev
shiyu1994 Mar 25, 2024
be29624
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Mar 25, 2024
06cfde4
Merge branch 'master' into nccl-dev
shiyu1994 Apr 11, 2024
75afe5e
Merge branch 'master' into nccl-dev
shiyu1994 May 20, 2024
1e6e4a1
Merge branch 'master' into nccl-dev
shiyu1994 Jun 30, 2024
614605c
merge master
shiyu1994 Oct 8, 2024
18babb0
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Oct 8, 2024
11f4062
fix compilation
shiyu1994 Oct 8, 2024
b4c21c2
use CUDAVector
shiyu1994 Oct 9, 2024
70fe10f
use CUDAVector
shiyu1994 Oct 9, 2024
849a554
merge master
shiyu1994 Oct 18, 2024
19a2662
merge master
shiyu1994 Oct 18, 2024
6db879a
use CUDAVector
shiyu1994 Oct 25, 2024
b43f88b
use CUDAVector for cuda tree and column data
shiyu1994 Oct 25, 2024
582c760
update gbdt
shiyu1994 Oct 25, 2024
b9e143b
changes for cuda tree
shiyu1994 Oct 25, 2024
483e521
use CUDAVector for cuda column data
shiyu1994 Oct 25, 2024
950199d
fix bug in GetDataByColumnPointers
shiyu1994 Oct 25, 2024
f30ee85
Merge branch 'master' into nccl-dev
shiyu1994 Oct 25, 2024
d11991a
disable cuda by default
shiyu1994 Oct 25, 2024
4bb4411
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Oct 25, 2024
b56b39e
fix single machine gbdt
shiyu1994 Oct 25, 2024
3bebc19
merge main
shiyu1994 Dec 17, 2024
47b4364
clean up
shiyu1994 Dec 17, 2024
a326c87
fix typo
shiyu1994 Dec 17, 2024
d8ea043
Merge branch 'master' into nccl-dev
shiyu1994 Dec 18, 2024
2f040b7
Merge branch 'master' into nccl-dev
shiyu1994 Dec 23, 2024
2 changes: 1 addition & 1 deletion — .ci/check_python_dists.sh

```diff
@@ -25,7 +25,7 @@ if [ $PY_MINOR_VER -gt 7 ]; then
     pydistcheck \
         --inspect \
         --ignore 'compiled-objects-have-debug-symbols,distro-too-large-compressed' \
-        --max-allowed-size-uncompressed '100M' \
+        --max-allowed-size-uncompressed '500M' \
         --max-allowed-files 800 \
         ${DIST_DIR}/* || exit 1
 elif { test $(uname -m) = "aarch64"; }; then
```
5 changes: 5 additions & 0 deletions — CMakeLists.txt

```diff
@@ -199,6 +199,7 @@ endif()

 if(USE_CUDA)
     find_package(CUDA 11.0 REQUIRED)
+    find_package(Nccl REQUIRED)
     include_directories(${CUDA_INCLUDE_DIRS})
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall")

@@ -587,6 +588,10 @@ if(USE_GPU)
     target_link_libraries(lightgbm_objs PUBLIC ${OpenCL_LIBRARY} ${Boost_LIBRARIES})
 endif()

+if(USE_CUDA)
+    target_link_libraries(lightgbm_objs PUBLIC ${NCCL_LIBRARY})
+endif(USE_CUDA)
+
 if(__INTEGRATE_OPENCL)
     # targets OpenCL and Boost are added in IntegratedOpenCL.cmake
     add_dependencies(lightgbm_objs OpenCL Boost)
```
2 changes: 1 addition & 1 deletion — R-package/tests/testthat/test_lgb.Booster.R

```diff
@@ -938,7 +938,7 @@ test_that("all parameters are stored correctly with save_model_to_string()", {
         , "[gpu_platform_id: -1]"
         , "[gpu_device_id: -1]"
         , "[gpu_use_dp: 0]"
-        , "[num_gpu: 1]"
+        , "[num_gpus: 1]"
     )
     all_param_entries <- c(non_default_param_entries, default_param_entries)
```
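A rough Python analogue of what this R assertion checks — a sketch for illustration, not a test from this PR. It assumes the model string's parameters section records the renamed default, mirroring the R expectation above:

```python
import numpy as np
import lightgbm as lgb

# Tiny synthetic dataset; we only care about the serialized parameters.
X = np.random.rand(100, 5)
y = np.random.rand(100)
booster = lgb.train(
    {"objective": "regression", "verbosity": -1},
    lgb.Dataset(X, label=y),
    num_boost_round=1,
)

# After this PR the parameters section of the model string should record
# "[num_gpus: 1]" where it previously recorded "[num_gpu: 1]".
model_str = booster.model_to_string()
assert "[num_gpus: 1]" in model_str
assert "[num_gpu: 1]" not in model_str
```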
70 changes: 70 additions & 0 deletions cmake/modules/FindNccl.cmake
@@ -0,0 +1,70 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tries to find NCCL headers and libraries.
#
# Usage of this module as follows:
#
# find_package(Nccl)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# NCCL_ROOT - When set, this path is inspected instead of standard library
# locations as the root of the NCCL installation.
# The environment variable NCCL_ROOT overrides this variable.
#
# This module defines
# Nccl_FOUND, whether nccl has been found
# NCCL_INCLUDE_DIR, directory containing header
# NCCL_LIBRARY, directory containing nccl library
# NCCL_LIB_NAME, nccl library name
# USE_NCCL_LIB_PATH, when set, NCCL_LIBRARY path is also inspected for the
# location of the nccl library. This would disable
# switching between static and shared.
#
# This module assumes that the user has already called find_package(CUDA)

if (NCCL_LIBRARY)
if(NOT USE_NCCL_LIB_PATH)
# Don't cache NCCL_LIBRARY to enable switching between static and shared.
unset(NCCL_LIBRARY CACHE)
endif(NOT USE_NCCL_LIB_PATH)
endif()

if (BUILD_WITH_SHARED_NCCL)
# libnccl.so
set(NCCL_LIB_NAME nccl)
else ()
# libnccl_static.a
set(NCCL_LIB_NAME nccl_static)
endif (BUILD_WITH_SHARED_NCCL)

find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include)

find_library(NCCL_LIBRARY
NAMES ${NCCL_LIB_NAME}
PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib)

message(STATUS "Using nccl library: ${NCCL_LIBRARY}")

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Nccl DEFAULT_MSG
NCCL_INCLUDE_DIR NCCL_LIBRARY)

mark_as_advanced(
NCCL_INCLUDE_DIR
NCCL_LIBRARY
)
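Usage note for reviewers: because the search paths above consult NCCL_ROOT first (with the environment variable overriding the CMake variable, per the module header), an NCCL install outside the standard library locations would presumably be picked up by configuring with something like `cmake -DUSE_CUDA=ON -DNCCL_ROOT=/path/to/nccl ..`, and setting `BUILD_WITH_SHARED_NCCL=ON` switches the lookup from `libnccl_static.a` to `libnccl.so`.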
24 changes: 16 additions & 8 deletions docs/Parameters.rst
```diff
@@ -1276,31 +1276,39 @@ GPU Parameters

 - ``gpu_platform_id`` :raw-html:`<a id="gpu_platform_id" title="Permalink to this parameter" href="#gpu_platform_id">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int

-  - OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform
+  - OpenCL platform ID with device_type=gpu. Usually each GPU vendor exposes one OpenCL platform

   - ``-1`` means the system-wide default platform

   - **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details

 - ``gpu_device_id`` :raw-html:`<a id="gpu_device_id" title="Permalink to this parameter" href="#gpu_device_id">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int

-  - OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID
+  - Master CUDA device ID with device_type=cuda or OpenCL device ID in the specified platform with device_type=gpu.
+
+  - Each GPU in the selected platform has a unique device ID

   - ``-1`` means the default device in the selected platform

   - **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details

+- ``num_gpus`` :raw-html:`<a id="num_gpus" title="Permalink to this parameter" href="#num_gpus">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int
+
+  - Number of GPUs to use for training, used with device_type=cuda
+
+  - When <= 0, only 1 GPU will be used
+
+- ``gpu_device_id_list`` :raw-html:`<a id="gpu_device_id_list" title="Permalink to this parameter" href="#gpu_device_id_list">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string
+
+  - List of CUDA device IDs used when device_type=cuda
+
+  - When empty, the devices with the smallest IDs will be used
+
-- ``num_gpu`` :raw-html:`<a id="num_gpu" title="Permalink to this parameter" href="#num_gpu">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, constraints: ``num_gpu > 0``
-
-  - number of GPUs
-
-  - **Note**: can be used only in CUDA implementation
-
 - ``gpu_use_dp`` :raw-html:`<a id="gpu_use_dp" title="Permalink to this parameter" href="#gpu_use_dp">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

   - set this to ``true`` to use double precision math on GPU (by default single precision is used)

   - **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported

 .. end params list
```

Review comment from a Collaborator, on the removal of ``num_gpu``:

> I'm ok with changing the main parameter name to num_gpus, but can we please keep num_gpu as a parameter alias? So that existing code using that parameter isn't broken?
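To make the new options concrete, here is a minimal Python sketch of multi-GPU training as the parameters above describe it. It assumes a wheel built from this branch with CUDA and NCCL available, and assumes `gpu_device_id_list` takes LightGBM's usual comma-separated string encoding:

```python
import numpy as np
import lightgbm as lgb

# Synthetic data, just to exercise the training path.
rng = np.random.default_rng(42)
X = rng.random((10_000, 20))
y = rng.random(10_000)
train_data = lgb.Dataset(X, label=y)

params = {
    "objective": "regression",
    "device_type": "cuda",
    # New in this PR: train across 2 GPUs; NCCL handles the collectives.
    "num_gpus": 2,
    # Optional explicit device selection; when empty, the devices with
    # the smallest IDs are used (per the parameter docs above).
    "gpu_device_id_list": "0,1",
}

booster = lgb.train(params, train_data, num_boost_round=10)
```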
4 changes: 3 additions & 1 deletion include/LightGBM/boosting.h
```diff
@@ -309,9 +309,11 @@ class LIGHTGBM_EXPORT Boosting {
   * \param format Format of model
   * \param config config for boosting
   * \param filename name of model file, if existing will continue to train from this model
+  * \param device_type type of device, can be cpu, gpu or cuda
+  * \param num_gpu number of GPUs to use
   * \return The boosting object
   */
-  static Boosting* CreateBoosting(const std::string& type, const char* filename);
+  static Boosting* CreateBoosting(const std::string& type, const char* filename, const std::string& device_type, const int num_gpu);

   virtual std::string GetLoadedParam() const = 0;
```
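This widened factory signature is what the "fix creating boosting object from file" commit relies on: when a booster is rebuilt from a saved model, the device configuration must reach CreateBoosting. A minimal sketch of the user-visible path this plausibly affects, assuming a CUDA-enabled build and a placeholder model path:

```python
import lightgbm as lgb

# Loading a saved model goes through LGBM_BoosterCreateFromModelfile,
# which calls Boosting::CreateBoosting(); with this PR that factory also
# receives the device type and GPU count instead of assuming CPU.
booster = lgb.Booster(model_file="model.txt")  # "model.txt" is a placeholder

# Subsequent prediction or continued training then runs on the device
# recorded in the model's parameters.
```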
18 changes: 11 additions & 7 deletions include/LightGBM/config.h
```diff
@@ -1091,25 +1091,29 @@ struct Config {
 #pragma region GPU Parameters
 #endif  // __NVCC__

-  // desc = OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform
+  // desc = OpenCL platform ID with device_type=gpu. Usually each GPU vendor exposes one OpenCL platform
   // desc = ``-1`` means the system-wide default platform
   // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
   int gpu_platform_id = -1;

-  // desc = OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID
+  // desc = Master CUDA device ID with device_type=cuda or OpenCL device ID in the specified platform with device_type=gpu.
+  // desc = Each GPU in the selected platform has a unique device ID
   // desc = ``-1`` means the default device in the selected platform
   // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
   int gpu_device_id = -1;

+  // desc = Number of GPUs to use for training, used with device_type=cuda
+  // desc = When <= 0, only 1 GPU will be used
+  int num_gpus = 1;
+
+  // desc = List of CUDA device IDs used when device_type=cuda
+  // desc = When empty, the devices with the smallest IDs will be used
+  std::string gpu_device_id_list = "";
+
   // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used)
   // desc = **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported
   bool gpu_use_dp = false;

-  // check = >0
-  // desc = number of GPUs
-  // desc = **Note**: can be used only in CUDA implementation
-  int num_gpu = 1;
-
 #ifndef __NVCC__
 #pragma endregion
```
2 changes: 2 additions & 0 deletions include/LightGBM/cuda/cuda_column_data.hpp
```diff
@@ -44,6 +44,8 @@ class CUDAColumnData {

   void* const* cuda_data_by_column() const { return cuda_data_by_column_; }

+  void* const* data_by_column() const { return data_by_column_.data(); }
+
   uint32_t feature_min_bin(const int feature_index) const { return feature_min_bin_[feature_index]; }

   uint32_t feature_max_bin(const int feature_index) const { return feature_max_bin_[feature_index]; }
```