Skip to content

Commit

Permalink
Add NCCL to the superbuild (#2441)
Browse files Browse the repository at this point in the history
* Add NCCL to the superbuild

* Update copyright year

* Update Superbuild README
  • Loading branch information
benson31 authored Apr 29, 2024
1 parent 7d1d17c commit 29b2dcc
Show file tree
Hide file tree
Showing 5 changed files with 192 additions and 5 deletions.
1 change: 1 addition & 0 deletions scripts/superbuild/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ set(LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY "COMMON"
# descending into subdirectories.
lbann_sb_add_packages(
# Ack, "third-order" dependencies
NCCL
RCCL

# These are "second-order" dependencies
Expand Down
4 changes: 4 additions & 0 deletions scripts/superbuild/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,17 @@ The following packages are known by the SuperBuild framework:
algebra library.
- [JPEG-TURBO](https://github.com/libjpeg-turbo/libjpeg-turbo) - JPEG
but in turbo mode. Zoom zoom zoom.
- [NCCL](https://github.com/NVIDIA/nccl) - The NVIDIA Collective
Communications Library.
- [OpenBLAS](https://github.com/xianyi/OpenBLAS.git) - BLAS library
for when your vendor doesn't do a good job.
- [OpenCV](https://github.com/opencv/opencv) - Computer vision
library.
- [protobuf](https://github.com/protocolbuffers/protobuf.git) - And
yet *another* serialization format that LBANN (and others) (ab)use
for model topology description and configuration.
- [RCCL](https://github.com/ROCm/rccl) - The ROCm Communication
Collectives Library.
- [spdlog](https://github.com/gabime/spdlog) - Fast C++ logging
library.
- [zstr](https://github.com/mateidavid/zstr) - C++ ZLib wrapper.
Expand Down
2 changes: 1 addition & 1 deletion scripts/superbuild/aluminum/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ lbann_sb_add_cmake_extern_pkg(
OPTIONAL_LANGUAGES CUDA HIP
GITHUB_URL llnl/Aluminum.git
GIT_TAG "master"
DEPENDS_ON Caliper RCCL)
DEPENDS_ON Caliper NCCL RCCL)

set(Aluminum_DIR ${LBANN_SB_Aluminum_PREFIX}
CACHE INTERNAL "The install prefix of Aluminum.")
16 changes: 12 additions & 4 deletions scripts/superbuild/cmake/modules/LBANNSuperBuildAddPackages.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,22 @@ include(CMakeDependentOption)

macro(lbann_sb_default_pkg_option PKG_NAME OPTION_NAME DOC_STR VALUE)
option(LBANN_SB_FWD_${PKG_NAME}_${OPTION_NAME}
"${DOC_STR}"
"${PKG_NAME}: ${DOC_STR}"
${VALUE})
endmacro ()

# Declare an ON/OFF option for the package that is currently being
# configured.
#
# The option is named LBANN_SB_FWD_<PKG_NAME>_<OPT> and its help text
# is DOC prefixed with "<PKG_NAME>: ".
#
#   OPT      Suffix appended to the package-specific option prefix.
#   DOC      Human-readable description of the option.
#   DEFAULT  Initial value handed to option().
#
# PKG_NAME is taken from the calling scope and is NOT validated here;
# callers must make sure it is already set before using this macro.
macro(lbann_sb_this_pkg_option OPT DOC DEFAULT)
  option(
    LBANN_SB_FWD_${PKG_NAME}_${OPT} "${PKG_NAME}: ${DOC}" ${DEFAULT})
endmacro ()

macro(lbann_sb_default_cuda_option PKG_NAME OPTION_NAME DOC_STR VALUE)
cmake_dependent_option(
LBANN_SB_FWD_${PKG_NAME}_${OPTION_NAME}
"${DOC_STR}"
"${PKG_NAME}: ${DOC_STR}"
${VALUE}
"LBANN_SB_DEFAULT_CUDA_OPTS"
OFF)
Expand All @@ -43,7 +51,7 @@ endmacro ()
macro(lbann_sb_default_rocm_option PKG_NAME OPTION_NAME DOC_STR VALUE)
cmake_dependent_option(
LBANN_SB_FWD_${PKG_NAME}_${OPTION_NAME}
"${DOC_STR}"
"${PKG_NAME}: ${DOC_STR}"
${VALUE}
"LBANN_SB_DEFAULT_ROCM_OPTS"
OFF)
Expand All @@ -52,7 +60,7 @@ endmacro ()
macro(lbann_sb_default_gpu_option PKG_NAME OPTION_NAME DOC_STR VALUE)
cmake_dependent_option(
LBANN_SB_FWD_${PKG_NAME}_${OPTION_NAME}
"${DOC_STR}"
"${PKG_NAME}: ${DOC_STR}"
${VALUE}
"LBANN_SB_DEFAULT_CUDA_OPTS OR LBANN_SB_DEFAULT_ROCM_OPTS"
OFF)
Expand Down
174 changes: 174 additions & 0 deletions scripts/superbuild/nccl/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
################################################################################
## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
## Produced at the Lawrence Livermore National Laboratory.
## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
## the CONTRIBUTORS file. <lbann-dev@llnl.gov>
##
## LLNL-CODE-697807.
## All rights reserved.
##
## This file is part of LBANN: Livermore Big Artificial Neural Network
## Toolkit. For details, see http://software.llnl.gov/LBANN or
## https://github.com/LLNL/LBANN.
##
## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
## may not use this file except in compliance with the License. You may
## obtain a copy of the License at:
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
## implied. See the License for the specific language governing
## permissions and limitations under the license.
################################################################################

# The goal of this package is to enable *basic* builds of NCCL. Any
# configuration that would require modification to the provided
# makefile is considered out-of-scope and users needing such
# configuration should consider a standalone build rather than
# superbuilding NCCL.
#
# Note that the default NCCL makefile is rather rigid. It looks like
# one _can_ inject flags, but they have a high likelihood of being
# trampled by the makefile. E.g., the makefile specifies optimization
# flags *after* the user injection point, so trying to modify the
# optimization level manually would be moot.

# Normalize a CMake truth value to a number, in place.
#
#   VAR  Name of the variable to rewrite.
#
# After the call, the variable named by VAR holds 1 if it evaluated as
# true in an if() test and 0 otherwise (including when it was unset).
macro(bool_as_num VAR)
  if (NOT ${VAR})
    set(${VAR} 0)
  else ()
    set(${VAR} 1)
  endif ()
endmacro ()

# Register NCCL with the superbuild. CUDA is intentionally missing
# from LANGUAGES: it can't be set explicitly and is inferred from
# ${CUDA_HOME} instead.
lbann_sb_init_extern_pkg(
  NAME NCCL
  LANGUAGES C CXX
  GITHUB_URL NVIDIA/nccl
  GIT_TAG "master")

# User-facing options, declared as ordinary ON/OFF switches.
lbann_sb_this_pkg_option(VERBOSE "Print build commands?" ON)
lbann_sb_this_pkg_option(
  KEEP "Keep intermediate files generated during compilation" OFF)
lbann_sb_this_pkg_option(ASAN "Build with address sanitizer enabled" OFF)
lbann_sb_this_pkg_option(TRACE "Build with tracing enabled" OFF)

# The switches are later forwarded to make as <NAME>=<0|1>, so turn
# each of them into its numeric form.
foreach (_nccl_opt IN ITEMS VERBOSE KEEP ASAN TRACE)
  bool_as_num(LBANN_SB_FWD_NCCL_${_nccl_opt})
endforeach ()

# Translate the package build type into the 0/1 value used for the
# makefile's DEBUG variable: 1 when the build type is "debug"
# (compared case-insensitively), 0 for anything else.
set(_nccl_debug 0)
string(TOLOWER "${LBANN_SB_${PKG_NAME}_BUILD_TYPE}" _nccl_build_type)
if (_nccl_build_type STREQUAL "debug")
  set(_nccl_debug 1)
endif ()

# Work out the CUDA_HOME assignment handed to make. An explicit
# LBANN_SB_FWD_NCCL_CUDA_PATH takes precedence over the CUDA_HOME
# environment variable; when neither is available, no assignment is
# produced and a warning is issued instead.
if (NOT LBANN_SB_FWD_NCCL_CUDA_PATH AND NOT DEFINED ENV{CUDA_HOME})
  message(WARNING
    "You have enabled NCCL package, but CUDA_HOME "
    "is not available in your environment.")
elseif (LBANN_SB_FWD_NCCL_CUDA_PATH)
  set(_nccl_cuda_path_opt "CUDA_HOME=${LBANN_SB_FWD_NCCL_CUDA_PATH}")
else ()
  set(_nccl_cuda_path_opt "CUDA_HOME=$ENV{CUDA_HOME}")
endif ()

# Gencode control. Prefer a user-specified LBANN_SB_FWD_NCCL_NVCC_GENCODE,
# then fall back to NVCC_GENCODE from the environment. When neither is
# given, no NVCC_GENCODE assignment is passed to make and a warning is
# issued.
if (LBANN_SB_FWD_NCCL_NVCC_GENCODE)
  set(_nccl_nvcc_gencode_opt
    "NVCC_GENCODE=${LBANN_SB_FWD_NCCL_NVCC_GENCODE}")
# The environment test must be spelled "DEFINED ENV{...}". Writing
# "DEFINED $ENV{...}" expands the value first and then asks whether a
# CMake variable with that name exists, which skips this branch.
elseif (DEFINED ENV{NVCC_GENCODE})
  set(_nccl_nvcc_gencode_opt
    "NVCC_GENCODE=$ENV{NVCC_GENCODE}")
else ()
  message(WARNING
    "You have enabled NCCL package, but you have not set "
    "the NVCC_GENCODE. This will build all gencodes supported "
    "by NCCL, which may increase the build time.")
endif ()

# The build system here is just a set of makefiles, so a make
# executable is mandatory. Stop at configure time if none is found;
# otherwise "GNU_MAKE_PROGRAM-NOTFOUND" would end up as the program
# name in the build and install commands.
find_program(GNU_MAKE_PROGRAM make)
if (NOT GNU_MAKE_PROGRAM)
  message(FATAL_ERROR
    "NCCL is built with its own makefiles, but no \"make\" program "
    "was found. Set GNU_MAKE_PROGRAM to the full path of make.")
endif ()

# Variable assignments and flags shared by the "src.build" and
# "src.install" invocations of make. The CUDA_HOME and NVCC_GENCODE
# entries simply vanish from the list when they were not set.
set(_nccl_make_args
  "PREFIX=${LBANN_SB_${PKG_NAME}_PREFIX}"
  "CC=${LBANN_SB_${PKG_NAME}_C_COMPILER}"
  "CXX=${LBANN_SB_${PKG_NAME}_CXX_COMPILER}"
  ${_nccl_cuda_path_opt}
  ${_nccl_nvcc_gencode_opt}
  "DEBUG=${_nccl_debug}"
  "VERBOSE=${LBANN_SB_FWD_NCCL_VERBOSE}"
  "KEEP=${LBANN_SB_FWD_NCCL_KEEP}"
  "ASAN=${LBANN_SB_FWD_NCCL_ASAN}"
  "TRACE=${LBANN_SB_FWD_NCCL_TRACE}"
  -j${${PKG_NAME}_MAX_MAKE_JOBS})

include (ExternalProject)
ExternalProject_Add(${PKG_NAME}
  PREFIX "${CMAKE_CURRENT_BINARY_DIR}"

  # Where the sources come from.
  ${LBANN_SB_GIT_REPOSITORY_TAG} ${LBANN_SB_${PKG_NAME}_URL}
  ${LBANN_SB_GIT_TAG_TAG} ${LBANN_SB_${PKG_NAME}_TAG}
  GIT_SHALLOW 1

  # Directory layout.
  TMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/tmp"
  STAMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/stamp"
  SOURCE_DIR "${LBANN_SB_${PKG_NAME}_SOURCE_DIR}"
  INSTALL_DIR "${LBANN_SB_${PKG_NAME}_PREFIX}"

  # The makefiles are run from inside the source tree.
  BUILD_IN_SOURCE 1
  USES_TERMINAL_BUILD 1

  # Step logging.
  LOG_DOWNLOAD 1
  LOG_UPDATE 1
  LOG_CONFIGURE 1
  LOG_BUILD 1
  LOG_INSTALL 1
  LOG_TEST 1

  # There is nothing to configure; building and installing are plain
  # make targets that receive the same set of arguments.
  CONFIGURE_COMMAND ""
  BUILD_COMMAND ${GNU_MAKE_PROGRAM} src.build ${_nccl_make_args}
  INSTALL_COMMAND ${GNU_MAKE_PROGRAM} src.install ${_nccl_make_args}
  )

# Record the install prefix in the cache as <PKG_NAME>_DIR.
set(${PKG_NAME}_DIR ${LBANN_SB_${PKG_NAME}_PREFIX}
  CACHE INTERNAL "The install prefix of ${PKG_NAME}.")

0 comments on commit 29b2dcc

Please sign in to comment.