[CUDA] Multi-GPU for CUDA Version #6138

Open: shiyu1994 wants to merge 70 commits into master from nccl-dev

Commits (70)
ee3923b  initialize nccl (shiyu1994, Oct 10, 2023)
82668d0  Merge branch 'master' into nccl-dev (shiyu1994, Oct 26, 2023)
6189cbb  Merge branch 'master' into nccl-dev (shiyu1994, Oct 27, 2023)
f39f877  change year in header (shiyu1994, Nov 8, 2023)
e513662  Merge branch 'master' into nccl-dev (shiyu1994, Nov 8, 2023)
47f3e50  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Nov 8, 2023)
985780f  add implementation of nccl gbdt (shiyu1994, Nov 8, 2023)
35b0ca1  add nccl topology (shiyu1994, Nov 9, 2023)
7d36a14  clean up (shiyu1994, Nov 9, 2023)
5470d99  Merge branch 'master' into nccl-dev (shiyu1994, Nov 9, 2023)
7b47a1e  clean up (shiyu1994, Nov 9, 2023)
839c375  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Nov 9, 2023)
8eaf3ad  Merge branch 'master' into nccl-dev (shiyu1994, Dec 15, 2023)
cc72fc8  Merge branch 'master' into nccl-dev (shiyu1994, Dec 22, 2023)
209e25d  set nccl info (shiyu1994, Jan 25, 2024)
431f967  support quantized training with categorical features on cpu (shiyu1994, Feb 5, 2024)
b07caf2  remove white spaces (shiyu1994, Feb 5, 2024)
cf60467  add tests for quantized training with categorical features (shiyu1994, Feb 5, 2024)
bf2f649  skip tests for cuda version (shiyu1994, Feb 5, 2024)
2fc9525  fix cases when only 1 data block in row-wise quantized histogram cons… (shiyu1994, Feb 6, 2024)
dce770c  remove useless capture (shiyu1994, Feb 6, 2024)
f0c44fc  Merge branch 'master' into nccl-dev (shiyu1994, Feb 6, 2024)
e2cb41f  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Feb 6, 2024)
f3985ef  fix inconsistency of gpu devices (shiyu1994, Feb 7, 2024)
d000a41  fix creating boosting object from file (shiyu1994, Feb 7, 2024)
ecdccd5  change num_gpu to num_gpus in test case (shiyu1994, Feb 7, 2024)
dfa4419  fix objective initialization (shiyu1994, Feb 9, 2024)
f4b8906  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Feb 9, 2024)
f0b22d1  fix c++ compilation warning (shiyu1994, Feb 9, 2024)
617b3b2  fix lint errors (shiyu1994, Feb 9, 2024)
6d090b2  Merge branch 'master' into fix-6257 (shiyu1994, Feb 20, 2024)
736ab8a  Merge branch 'master' into nccl-dev (shiyu1994, Feb 20, 2024)
ad72d9f  Merge branch 'fix-6257' into nccl-dev (shiyu1994, Feb 20, 2024)
2670f48  fix compilation warnings (shiyu1994, Feb 20, 2024)
02b725b  change num_gpu to num_gpus in R test case (shiyu1994, Feb 20, 2024)
3bfb784  add nccl synchronization in tree training (shiyu1994, Feb 20, 2024)
fe1f592  fix global num data update (shiyu1994, Feb 21, 2024)
a528bd6  merge master (shiyu1994, Feb 22, 2024)
996d70b  fix ruff-format issues (shiyu1994, Feb 22, 2024)
671bed3  merge master (shiyu1994, Feb 23, 2024)
34610fb  use global num data in split finder (shiyu1994, Feb 23, 2024)
041018b  Merge branch 'master' into nccl-dev (shiyu1994, Mar 6, 2024)
e1b4512  explicit initialization of NCCLInfo members (shiyu1994, Mar 11, 2024)
0a21b5f  Merge branch 'master' into nccl-dev (shiyu1994, Mar 25, 2024)
be29624  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Mar 25, 2024)
06cfde4  Merge branch 'master' into nccl-dev (shiyu1994, Apr 11, 2024)
75afe5e  Merge branch 'master' into nccl-dev (shiyu1994, May 20, 2024)
1e6e4a1  Merge branch 'master' into nccl-dev (shiyu1994, Jun 30, 2024)
614605c  merge master (shiyu1994, Oct 8, 2024)
18babb0  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Oct 8, 2024)
11f4062  fix compilation (shiyu1994, Oct 8, 2024)
b4c21c2  use CUDAVector (shiyu1994, Oct 9, 2024)
70fe10f  use CUDAVector (shiyu1994, Oct 9, 2024)
849a554  merge master (shiyu1994, Oct 18, 2024)
19a2662  merge master (shiyu1994, Oct 18, 2024)
6db879a  use CUDAVector (shiyu1994, Oct 25, 2024)
b43f88b  use CUDAVector for cuda tree and column data (shiyu1994, Oct 25, 2024)
582c760  update gbdt (shiyu1994, Oct 25, 2024)
b9e143b  changes for cuda tree (shiyu1994, Oct 25, 2024)
483e521  use CUDAVector for cuda column data (shiyu1994, Oct 25, 2024)
950199d  fix bug in GetDataByColumnPointers (shiyu1994, Oct 25, 2024)
f30ee85  Merge branch 'master' into nccl-dev (shiyu1994, Oct 25, 2024)
d11991a  disable cuda by default (shiyu1994, Oct 25, 2024)
4bb4411  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into… (shiyu1994, Oct 25, 2024)
b56b39e  fix single machine gbdt (shiyu1994, Oct 25, 2024)
3bebc19  merge main (shiyu1994, Dec 17, 2024)
47b4364  clean up (shiyu1994, Dec 17, 2024)
a326c87  fix typo (shiyu1994, Dec 17, 2024)
d8ea043  Merge branch 'master' into nccl-dev (shiyu1994, Dec 18, 2024)
2f040b7  Merge branch 'master' into nccl-dev (shiyu1994, Dec 23, 2024)
2 changes: 1 addition & 1 deletion .ci/check-python-dists.sh
@@ -32,7 +32,7 @@ if [ "$PY_MINOR_VER" -gt 7 ]; then
     --inspect \
     --ignore 'compiled-objects-have-debug-symbols'\
     --ignore 'distro-too-large-compressed' \
-    --max-allowed-size-uncompressed '100M' \
+    --max-allowed-size-uncompressed '500M' \
     --max-allowed-files 800 \
     "$(echo "${DIST_DIR}"/*)" || exit 1
 elif { test "$(uname -m)" = "aarch64"; }; then
6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -218,6 +218,7 @@ endif()

 if(USE_CUDA)
   find_package(CUDAToolkit 11.0 REQUIRED)
+  find_package(Nccl REQUIRED)
   include_directories(${CUDAToolkit_INCLUDE_DIRS})
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall")

@@ -451,6 +452,7 @@ set(
   LGBM_CUDA_SOURCES
   src/boosting/cuda/cuda_score_updater.cpp
   src/boosting/cuda/cuda_score_updater.cu
+  src/boosting/cuda/nccl_gbdt.cpp
   src/metric/cuda/cuda_binary_metric.cpp
   src/metric/cuda/cuda_pointwise_metric.cpp
   src/metric/cuda/cuda_regression_metric.cpp
@@ -601,6 +603,10 @@ if(USE_GPU)
   target_link_libraries(lightgbm_objs PUBLIC ${OpenCL_LIBRARY} ${Boost_LIBRARIES})
 endif()

+if(USE_CUDA)
+  target_link_libraries(lightgbm_objs PUBLIC ${NCCL_LIBRARY})
+endif(USE_CUDA)
+
 if(__INTEGRATE_OPENCL)
   # targets OpenCL and Boost are added in IntegratedOpenCL.cmake
   add_dependencies(lightgbm_objs OpenCL Boost)
2 changes: 1 addition & 1 deletion R-package/tests/testthat/test_lgb.Booster.R
@@ -1107,7 +1107,7 @@ test_that("all parameters are stored correctly with save_model_to_string()", {
     , "[gpu_platform_id: -1]"
     , "[gpu_device_id: -1]"
     , "[gpu_use_dp: 0]"
-    , "[num_gpu: 1]"
+    , "[num_gpus: 1]"
   )
   all_param_entries <- c(non_default_param_entries, default_param_entries)
70 changes: 70 additions & 0 deletions cmake/modules/FindNccl.cmake
@@ -0,0 +1,70 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tries to find NCCL headers and libraries.
#
# Usage of this module as follows:
#
# find_package(NCCL)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# NCCL_ROOT - When set, this path is inspected instead of standard library
# locations as the root of the NCCL installation.
# The environment variable NCCL_ROOT overrides this variable.
#
# This module defines
# Nccl_FOUND, whether nccl has been found
# NCCL_INCLUDE_DIR, directory containing header
# NCCL_LIBRARY, directory containing nccl library
# NCCL_LIB_NAME, nccl library name
# USE_NCCL_LIB_PATH, when set, NCCL_LIBRARY path is also inspected for the
# location of the nccl library. This would disable
# switching between static and shared.
#
# This module assumes that the user has already called find_package(CUDA)

if (NCCL_LIBRARY)
if(NOT USE_NCCL_LIB_PATH)
# Don't cache NCCL_LIBRARY to enable switching between static and shared.
unset(NCCL_LIBRARY CACHE)
endif(NOT USE_NCCL_LIB_PATH)
endif()

if (BUILD_WITH_SHARED_NCCL)
# libnccl.so
set(NCCL_LIB_NAME nccl)
else ()
# libnccl_static.a
set(NCCL_LIB_NAME nccl_static)
endif (BUILD_WITH_SHARED_NCCL)

find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include)

find_library(NCCL_LIBRARY
NAMES ${NCCL_LIB_NAME}
PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib)

message(STATUS "Using nccl library: ${NCCL_LIBRARY}")

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Nccl DEFAULT_MSG
NCCL_INCLUDE_DIR NCCL_LIBRARY)

mark_as_advanced(
NCCL_INCLUDE_DIR
NCCL_LIBRARY
)
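For context on what the new find-module enables: NCCL supplies the collective operations (all-reduce, all-gather) by which per-GPU gradients and histograms can be combined during training. Below is a minimal, self-contained sketch of a single-process all-reduce across all visible GPUs. It is illustrative only and not code from this PR; the buffer size and float data type are placeholders.

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

int main() {
  int num_gpus = 0;
  cudaGetDeviceCount(&num_gpus);

  // One communicator per visible device (a null devlist selects devices 0..n-1).
  std::vector<ncclComm_t> comms(num_gpus);
  ncclCommInitAll(comms.data(), num_gpus, nullptr);

  const size_t count = 1024;  // placeholder size, e.g. one histogram buffer
  std::vector<float*> bufs(num_gpus);
  std::vector<cudaStream_t> streams(num_gpus);
  for (int i = 0; i < num_gpus; ++i) {
    cudaSetDevice(i);
    cudaMalloc(reinterpret_cast<void**>(&bufs[i]), count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // Sum each device's buffer across all GPUs, in place.
  ncclGroupStart();
  for (int i = 0; i < num_gpus; ++i) {
    ncclAllReduce(bufs[i], bufs[i], count, ncclFloat, ncclSum, comms[i], streams[i]);
  }
  ncclGroupEnd();

  for (int i = 0; i < num_gpus; ++i) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
    cudaStreamDestroy(streams[i]);
    cudaFree(bufs[i]);
    ncclCommDestroy(comms[i]);
  }
  return 0;
}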
12 changes: 12 additions & 0 deletions docs/Parameters.rst
@@ -1371,6 +1371,18 @@ GPU Parameters

 - **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details

+- ``num_gpus`` :raw-html:`<a id="num_gpus" title="Permalink to this parameter" href="#num_gpus">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int
+
+   - Number of GPUs to use for training, used with device_type=cuda
+
+   - When <= 0, only 1 GPU will be used
+
+- ``gpu_device_id_list`` :raw-html:`<a id="gpu_device_id_list" title="Permalink to this parameter" href="#gpu_device_id_list">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string
+
+   - List of CUDA device IDs used when device_type=cuda
+
+   - When empty, the devices with the smallest IDs will be used
+
 - ``gpu_use_dp`` :raw-html:`<a id="gpu_use_dp" title="Permalink to this parameter" href="#gpu_use_dp">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

   - set this to ``true`` to use double precision math on GPU (by default single precision is used)
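A hedged usage sketch of the two new parameters through LightGBM's C API. The parameter-string format and the C API calls are standard; "train.svm" is a placeholder file name, and the example assumes a build with USE_CUDA plus this PR's changes.

#include <LightGBM/c_api.h>

int main() {
  const char* params =
      "objective=binary device_type=cuda num_gpus=2 gpu_device_id_list=0,1";

  // Load training data and create a booster with the multi-GPU parameters.
  DatasetHandle train_data = nullptr;
  LGBM_DatasetCreateFromFile("train.svm", params, nullptr, &train_data);

  BoosterHandle booster = nullptr;
  LGBM_BoosterCreate(train_data, params, &booster);

  int is_finished = 0;
  for (int iter = 0; iter < 100 && !is_finished; ++iter) {
    LGBM_BoosterUpdateOneIter(booster, &is_finished);
  }

  LGBM_BoosterFree(booster);
  LGBM_DatasetFree(train_data);
  return 0;
}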
4 changes: 3 additions & 1 deletion include/LightGBM/boosting.h
@@ -309,9 +309,11 @@ class LIGHTGBM_EXPORT Boosting {
    * \param format Format of model
    * \param config config for boosting
    * \param filename name of model file, if existing will continue to train from this model
+   * \param device_type type of device, can be cpu, gpu or cuda
+   * \param num_gpu number of GPUs to use
    * \return The boosting object
    */
-  static Boosting* CreateBoosting(const std::string& type, const char* filename);
+  static Boosting* CreateBoosting(const std::string& type, const char* filename, const std::string& device_type, const int num_gpu);

   virtual std::string GetLoadedParam() const = 0;
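For illustration, a call against the widened factory signature might look like the following sketch; "gbdt" is LightGBM's standard boosting type, a null filename means no existing model to resume from, and the device count is a placeholder.

#include <LightGBM/boosting.h>

// Sketch: request a GBDT boosting object that trains on 2 CUDA GPUs.
LightGBM::Boosting* MakeMultiGPUBooster() {
  return LightGBM::Boosting::CreateBoosting("gbdt", nullptr, "cuda", 2);
}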
8 changes: 8 additions & 0 deletions include/LightGBM/config.h
@@ -1125,6 +1125,14 @@ struct Config {
   // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
   int gpu_device_id = -1;

+  // desc = Number of GPUs to use for training, used with device_type=cuda
+  // desc = When <= 0, only 1 GPU will be used
+  int num_gpus = 1;
+
+  // desc = List of CUDA device IDs used when device_type=cuda
+  // desc = When empty, the devices with the smallest IDs will be used
+  std::string gpu_device_id_list = "";
+
   // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used)
   // desc = **Note**: can be used only in OpenCL implementation (``device_type="gpu"``), in CUDA implementation only double precision is currently supported
   bool gpu_use_dp = false;
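To make the semantics of the two new fields concrete, here is an illustrative parser for gpu_device_id_list that also applies the documented defaults. The helper name ParseDeviceIDList is hypothetical; this is not the PR's actual parsing code.

#include <sstream>
#include <string>
#include <vector>

// Hypothetical helper: turn "0,2,3" into {0, 2, 3}. Falls back to the
// documented defaults: num_gpus <= 0 means 1 GPU, and an empty list means
// the devices with the smallest IDs are used.
std::vector<int> ParseDeviceIDList(const std::string& gpu_device_id_list, int num_gpus) {
  std::vector<int> device_ids;
  std::stringstream ss(gpu_device_id_list);
  std::string token;
  while (std::getline(ss, token, ',')) {
    if (!token.empty()) {
      device_ids.push_back(std::stoi(token));
    }
  }
  if (device_ids.empty()) {
    const int n = num_gpus <= 0 ? 1 : num_gpus;
    for (int i = 0; i < n; ++i) {
      device_ids.push_back(i);
    }
  }
  return device_ids;
}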
67 changes: 38 additions & 29 deletions include/LightGBM/cuda/cuda_column_data.hpp
@@ -13,6 +13,7 @@
 #include <LightGBM/bin.h>
 #include <LightGBM/utils/openmp_wrapper.h>

+#include <memory>
 #include <cstdint>
 #include <vector>

@@ -39,11 +40,11 @@ class CUDAColumnData {
     const std::vector<uint8_t>& feature_mfb_is_na,
     const std::vector<int>& feature_to_column);

-  const void* GetColumnData(const int column_index) const { return data_by_column_[column_index]; }
+  const void* GetColumnData(const int column_index) const { return data_by_column_[column_index]->RawData(); }

   void CopySubrow(const CUDAColumnData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices);

-  void* const* cuda_data_by_column() const { return cuda_data_by_column_; }
+  void* const* cuda_data_by_column() const { return cuda_data_by_column_.RawData(); }

   uint32_t feature_min_bin(const int feature_index) const { return feature_min_bin_[feature_index]; }

@@ -63,42 +64,50 @@ class CUDAColumnData {

   uint8_t feature_mfb_is_na(const int feature_index) const { return feature_mfb_is_na_[feature_index]; }

-  const uint32_t* cuda_feature_min_bin() const { return cuda_feature_min_bin_; }
+  const uint32_t* cuda_feature_min_bin() const { return cuda_feature_min_bin_.RawData(); }

-  const uint32_t* cuda_feature_max_bin() const { return cuda_feature_max_bin_; }
+  const uint32_t* cuda_feature_max_bin() const { return cuda_feature_max_bin_.RawData(); }

-  const uint32_t* cuda_feature_offset() const { return cuda_feature_offset_; }
+  const uint32_t* cuda_feature_offset() const { return cuda_feature_offset_.RawData(); }

-  const uint32_t* cuda_feature_most_freq_bin() const { return cuda_feature_most_freq_bin_; }
+  const uint32_t* cuda_feature_most_freq_bin() const { return cuda_feature_most_freq_bin_.RawData(); }

-  const uint32_t* cuda_feature_default_bin() const { return cuda_feature_default_bin_; }
+  const uint32_t* cuda_feature_default_bin() const { return cuda_feature_default_bin_.RawData(); }

-  const uint8_t* cuda_feature_missing_is_zero() const { return cuda_feature_missing_is_zero_; }
+  const uint8_t* cuda_feature_missing_is_zero() const { return cuda_feature_missing_is_zero_.RawData(); }

-  const uint8_t* cuda_feature_missing_is_na() const { return cuda_feature_missing_is_na_; }
+  const uint8_t* cuda_feature_missing_is_na() const { return cuda_feature_missing_is_na_.RawData(); }

-  const uint8_t* cuda_feature_mfb_is_zero() const { return cuda_feature_mfb_is_zero_; }
+  const uint8_t* cuda_feature_mfb_is_zero() const { return cuda_feature_mfb_is_zero_.RawData(); }

-  const uint8_t* cuda_feature_mfb_is_na() const { return cuda_feature_mfb_is_na_; }
+  const uint8_t* cuda_feature_mfb_is_na() const { return cuda_feature_mfb_is_na_.RawData(); }

-  const int* cuda_feature_to_column() const { return cuda_feature_to_column_; }
+  const int* cuda_feature_to_column() const { return cuda_feature_to_column_.RawData(); }

-  const uint8_t* cuda_column_bit_type() const { return cuda_column_bit_type_; }
+  const uint8_t* cuda_column_bit_type() const { return cuda_column_bit_type_.RawData(); }

   int feature_to_column(const int feature_index) const { return feature_to_column_[feature_index]; }

   uint8_t column_bit_type(const int column_index) const { return column_bit_type_[column_index]; }

  private:
   template <bool IS_SPARSE, bool IS_4BIT, typename BIN_TYPE>
-  void InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, void** out_column_data_pointer);
+  void InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, CUDAVector<void>* out_column_data_pointer);

   void LaunchCopySubrowKernel(void* const* in_cuda_data_by_column);

   void InitColumnMetaInfo();

   void ResizeWhenCopySubrow(const data_size_t num_used_indices);

+  std::vector<void*> GetDataByColumnPointers(const std::vector<std::unique_ptr<CUDAVector<void>>>& data_by_column) const {
+    std::vector<void*> data_by_column_pointers(data_by_column.size(), nullptr);
+    for (size_t i = 0; i < data_by_column.size(); ++i) {
+      data_by_column_pointers[i] = reinterpret_cast<void*>(data_by_column[i]->RawData());
+    }
+    return data_by_column_pointers;
+  }
+
   int gpu_device_id_;
   int num_threads_;
   data_size_t num_data_;
@@ -113,24 +122,24 @@ class CUDAColumnData {
   std::vector<uint8_t> feature_missing_is_na_;
   std::vector<uint8_t> feature_mfb_is_zero_;
   std::vector<uint8_t> feature_mfb_is_na_;
-  void** cuda_data_by_column_;
+  CUDAVector<void*> cuda_data_by_column_;
   std::vector<int> feature_to_column_;
-  std::vector<void*> data_by_column_;
-
-  uint8_t* cuda_column_bit_type_;
-  uint32_t* cuda_feature_min_bin_;
-  uint32_t* cuda_feature_max_bin_;
-  uint32_t* cuda_feature_offset_;
-  uint32_t* cuda_feature_most_freq_bin_;
-  uint32_t* cuda_feature_default_bin_;
-  uint8_t* cuda_feature_missing_is_zero_;
-  uint8_t* cuda_feature_missing_is_na_;
-  uint8_t* cuda_feature_mfb_is_zero_;
-  uint8_t* cuda_feature_mfb_is_na_;
-  int* cuda_feature_to_column_;
+  std::vector<std::unique_ptr<CUDAVector<void>>> data_by_column_;
+
+  CUDAVector<uint8_t> cuda_column_bit_type_;
+  CUDAVector<uint32_t> cuda_feature_min_bin_;
+  CUDAVector<uint32_t> cuda_feature_max_bin_;
+  CUDAVector<uint32_t> cuda_feature_offset_;
+  CUDAVector<uint32_t> cuda_feature_most_freq_bin_;
+  CUDAVector<uint32_t> cuda_feature_default_bin_;
+  CUDAVector<uint8_t> cuda_feature_missing_is_zero_;
+  CUDAVector<uint8_t> cuda_feature_missing_is_na_;
+  CUDAVector<uint8_t> cuda_feature_mfb_is_zero_;
+  CUDAVector<uint8_t> cuda_feature_mfb_is_na_;
+  CUDAVector<int> cuda_feature_to_column_;

   // used when bagging with subset
-  data_size_t* cuda_used_indices_;
+  CUDAVector<data_size_t> cuda_used_indices_;
   data_size_t num_used_indices_;
   data_size_t cur_subset_buffer_size_;
 };
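The recurring change in this header (and the next one) replaces manually managed device pointers with CUDAVector members, so device memory is released by the destructor rather than by hand-written frees. Below is a minimal sketch of what such an owning wrapper looks like, assuming only the plain CUDA runtime API; LightGBM's actual CUDAVector lives elsewhere in the codebase and is more featureful (its CUDAVector<void> use, for instance, implies a specialization not shown here).

#include <cuda_runtime.h>
#include <cstddef>

// Minimal RAII wrapper over a device allocation (illustrative only).
template <typename T>
class CUDAVector {
 public:
  CUDAVector() = default;
  ~CUDAVector() { Clear(); }
  // One owner per allocation: forbid copies.
  CUDAVector(const CUDAVector&) = delete;
  CUDAVector& operator=(const CUDAVector&) = delete;

  void Resize(size_t size) {
    Clear();
    if (size > 0) {
      cudaMalloc(reinterpret_cast<void**>(&data_), size * sizeof(T));
    }
    size_ = size;
  }

  void Clear() {
    if (data_ != nullptr) {
      cudaFree(data_);
      data_ = nullptr;
    }
    size_ = 0;
  }

  T* RawData() const { return data_; }
  size_t Size() const { return size_; }

 private:
  T* data_ = nullptr;
  size_t size_ = 0;
};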
18 changes: 9 additions & 9 deletions include/LightGBM/cuda/cuda_metadata.hpp
@@ -35,20 +35,20 @@ class CUDAMetadata {

   void SetInitScore(const double* init_score, data_size_t len);

-  const label_t* cuda_label() const { return cuda_label_; }
+  const label_t* cuda_label() const { return cuda_label_.RawData(); }

-  const label_t* cuda_weights() const { return cuda_weights_; }
+  const label_t* cuda_weights() const { return cuda_weights_.RawData(); }

-  const data_size_t* cuda_query_boundaries() const { return cuda_query_boundaries_; }
+  const data_size_t* cuda_query_boundaries() const { return cuda_query_boundaries_.RawData(); }

-  const label_t* cuda_query_weights() const { return cuda_query_weights_; }
+  const label_t* cuda_query_weights() const { return cuda_query_weights_.RawData(); }

  private:
-  label_t* cuda_label_;
-  label_t* cuda_weights_;
-  data_size_t* cuda_query_boundaries_;
-  label_t* cuda_query_weights_;
-  double* cuda_init_score_;
+  CUDAVector<label_t> cuda_label_;
+  CUDAVector<label_t> cuda_weights_;
+  CUDAVector<data_size_t> cuda_query_boundaries_;
+  CUDAVector<label_t> cuda_query_weights_;
+  CUDAVector<double> cuda_init_score_;
 };

 } // namespace LightGBM