Skip to content

Commit

Permalink
Add aws-ofi-rccl plugin to the superbuild (#2440)
Browse files Browse the repository at this point in the history
* Add aws-ofi-rccl plugin to the superbuild

* Minor adjustment to if/else blocks

* Add aws-ofi-rccl build to the example rocm script

* Add license statement to aws-ofi-rccl recipe

* Remove a superfluous DEPENDS_ON
  • Loading branch information
benson31 authored Apr 29, 2024
1 parent 052c602 commit 7d1d17c
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 0 deletions.
12 changes: 12 additions & 0 deletions scripts/superbuild/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ lbann_sb_add_packages(
protobuf
zstr

# This should just be provided on relevant systems.
AWS_OFI_RCCL

# And finally add the option to build LBANN
LBANN)

Expand Down Expand Up @@ -140,6 +143,15 @@ message("\n-----------------------------------------------------------------\n")
# Record the suggested CMAKE_PREFIX_PATH as a one-line "export"
# statement in a shell snippet located in the top-level build
# directory.
file(WRITE
  "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh"
  "export CMAKE_PREFIX_PATH="
  "${LBANN_SB_SUGG_CMAKE_PREFIX_PATH}"
  "\n")

# When the AWS_OFI_RCCL plugin is being built, remind the user that
# the plugin's library directory must be on the dynamic library search
# path at run time, and print a ready-to-copy "export" line. The "$",
# "{" and "}" are escaped so that the shell variable reference is
# printed literally instead of being expanded by CMake.
if (LBANN_SB_BUILD_AWS_OFI_RCCL)
  message("-----------------------------------------------------------------\n")
  message("You are building the AWS_OFI_RCCL plugin. This plugin's library")
  message("path *must* be in the dynamic library search path at run time to")
  message("have an effect. It may be useful to do the following:\n")
  message("export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBRARY_PATH\}\n")
  message("-----------------------------------------------------------------\n")
endif ()

# Add a custom target for bundling all things up
if (UNIX)
find_program(__FIND_EXE find)
Expand Down
3 changes: 3 additions & 0 deletions scripts/superbuild/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ The following packages are known by the SuperBuild framework:
- [Aluminum](https://github.com/LLNL/Aluminum) - High-performance
communication library that provides a stream-aware interface and
semantics.
- [AWS\_OFI\_RCCL](https://github.com/ROCm/aws-ofi-rccl) - A plugin
that should be provided by vendors or supercomputing centers to
allow RCCL to use libfabric in place of InfiniBand (IB).
- [Catch2](https://github.com/catchorg/catch2) - A unit-testing
framework for C++ packages. (Mostly for developers; also used by
H2 and Hydrogen, if enabled.)
Expand Down
152 changes: 152 additions & 0 deletions scripts/superbuild/aws_ofi_rccl/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
################################################################################
## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
## Produced at the Lawrence Livermore National Laboratory.
## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
## the CONTRIBUTORS file. <[email protected]>
##
## LLNL-CODE-697807.
## All rights reserved.
##
## This file is part of LBANN: Livermore Big Artificial Neural Network
## Toolkit. For details, see http://software.llnl.gov/LBANN or
## https://github.com/LLNL/LBANN.
##
## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
## may not use this file except in compliance with the License. You may
## obtain a copy of the License at:
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
## implied. See the License for the specific language governing
## permissions and limitations under the license.
################################################################################

# Declare the AWS_OFI_RCCL package to the superbuild: a C-only package
# fetched from the ROCm/aws-ofi-rccl GitHub repository at the "cxi"
# tag.
# NOTE(review): lbann_sb_init_extern_pkg is defined elsewhere; it
# presumably sets up the PKG_NAME and LBANN_SB_<pkg>_* variables used
# later in this file -- confirm against its definition.
lbann_sb_init_extern_pkg(
NAME AWS_OFI_RCCL
LANGUAGES C
GITHUB_URL ROCm/aws-ofi-rccl
GIT_TAG "cxi") # This is the head of their develop.

# Choose the HIP location handed to the configure script through
# "--with-hip". An explicitly forwarded
# LBANN_SB_FWD_AWS_OFI_RCCL_HIP_PATH wins over ROCM_PATH from the
# environment. If neither is available, a warning is issued and this
# block leaves the option unset.
if (NOT LBANN_SB_FWD_AWS_OFI_RCCL_HIP_PATH)
  if (DEFINED ENV{ROCM_PATH})
    set(_aws_ofi_rccl_hip_path_opt "--with-hip=$ENV{ROCM_PATH}")
  else ()
    message(WARNING
      "You have enabled AWS_OFI_RCCL package, but ROCM_PATH "
      "is not available in your environment.")
  endif ()
else ()
  set(_aws_ofi_rccl_hip_path_opt
    "--with-hip=${LBANN_SB_FWD_AWS_OFI_RCCL_HIP_PATH}")
endif ()

# Choose the RCCL location handed to the configure script through
# "--with-rccl". First look for a user-provided path. Then look for a
# RCCL built by this superbuild. Finally, fall back on ROCM_PATH from
# the environment. If none applies, the option is left unset.
if (LBANN_SB_FWD_AWS_OFI_RCCL_RCCL_PATH)
  set(_aws_ofi_rccl_rccl_path_opt
    "--with-rccl=${LBANN_SB_FWD_AWS_OFI_RCCL_RCCL_PATH}")
elseif (LBANN_SB_BUILD_RCCL)
  # Use the install directory of the superbuild's own RCCL project.
  ExternalProject_Get_property(RCCL INSTALL_DIR)
  set(_aws_ofi_rccl_rccl_path_opt
    "--with-rccl=${INSTALL_DIR}")
  # This must be a two-element list (keyword, target), NOT the single
  # string "DEPENDS RCCL": the variable is expanded unquoted into the
  # ExternalProject_Add() call, and only a list splits into the
  # separate DEPENDS and RCCL arguments that register the dependency.
  set(_aws_ofi_rccl_rccl_dep DEPENDS RCCL)
elseif (DEFINED ENV{ROCM_PATH})
  set(_aws_ofi_rccl_rccl_path_opt
    "--with-rccl=$ENV{ROCM_PATH}")
endif ()

# Choose the libfabric location handed to the configure script through
# "--with-libfabric". A user-provided path is preferred. Otherwise we
# try to detect the install prefix ourselves: first by asking
# pkg-config for libfabric's "prefix" variable, then by locating the
# "fi_info" executable and taking the directory two levels above it.
# If nothing is found, the option is left unset.
if (LBANN_SB_FWD_AWS_OFI_RCCL_LIBFABRIC_PATH)
  set(_aws_ofi_rccl_libfabric_path_opt
    "--with-libfabric=${LBANN_SB_FWD_AWS_OFI_RCCL_LIBFABRIC_PATH}")
else ()
  # Attempt 1: pkg-config.
  find_package(PkgConfig)
  if (PKG_CONFIG_FOUND)
    pkg_get_variable(_libfabric_dir libfabric prefix)
  endif ()

  # Attempt 2: probe for fi_info, but only if pkg-config came up empty.
  if (NOT _libfabric_dir)
    find_program(_fi_info_exe fi_info)
  endif ()
  if (NOT _libfabric_dir AND _fi_info_exe)
    # <dir>/<subdir>/fi_info -> <dir>/<subdir> -> <dir>
    get_filename_component(_fi_info_dir "${_fi_info_exe}" DIRECTORY)
    get_filename_component(_libfabric_dir "${_fi_info_dir}" DIRECTORY)
  endif ()

  if (_libfabric_dir)
    set(_aws_ofi_rccl_libfabric_path_opt
      "--with-libfabric=${_libfabric_dir}")
  endif ()
endif ()

# Last, choose the MPI location handed to the configure script through
# "--with-mpi". A user-provided path has priority; failing that,
# "MPICH_DIR" from the environment is used, which should be safe even
# if the Cray compiler is being used. When neither is present the
# option is left unset, so the configure script does its own detection
# and errors out if it cannot sufficiently detect MPI.
if (DEFINED ENV{MPICH_DIR} AND NOT LBANN_SB_FWD_AWS_OFI_RCCL_MPI_PATH)
  set(_aws_ofi_rccl_mpi_path_opt "--with-mpi=$ENV{MPICH_DIR}")
elseif (LBANN_SB_FWD_AWS_OFI_RCCL_MPI_PATH)
  set(_aws_ofi_rccl_mpi_path_opt
    "--with-mpi=${LBANN_SB_FWD_AWS_OFI_RCCL_MPI_PATH}")
endif ()

# This is an autotools package, so we need 'make'. The result is used
# verbatim as the build and install command below, so fail at
# configure time with a clear message rather than letting the build
# try to run "GNU_MAKE_PROGRAM-NOTFOUND".
find_program(GNU_MAKE_PROGRAM make)
if (NOT GNU_MAKE_PROGRAM)
  message(FATAL_ERROR
    "Could not find 'make', which is required to build AWS_OFI_RCCL. "
    "Set GNU_MAKE_PROGRAM to the full path of a 'make' executable.")
endif ()

# Define the external project for the plugin. The package is built in
# its source tree: the "configure" script from the source directory is
# run with the install prefix, the C compiler, and whichever of the
# optional "--with-*" options were set above; "make" then builds and
# installs it.
include(ExternalProject)
ExternalProject_Add(${PKG_NAME}
  PREFIX "${CMAKE_CURRENT_BINARY_DIR}"

  # Repository location and revision, as provided by the superbuild.
  ${LBANN_SB_GIT_REPOSITORY_TAG} ${LBANN_SB_${PKG_NAME}_URL}
  ${LBANN_SB_GIT_TAG_TAG} ${LBANN_SB_${PKG_NAME}_TAG}

  # Directory layout.
  SOURCE_DIR "${LBANN_SB_${PKG_NAME}_SOURCE_DIR}"
  INSTALL_DIR "${LBANN_SB_${PKG_NAME}_PREFIX}"
  STAMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/stamp"
  TMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/tmp"

  BUILD_IN_SOURCE 1
  USES_TERMINAL_BUILD 1

  # Log the output of every step.
  LOG_DOWNLOAD 1
  LOG_UPDATE 1
  LOG_CONFIGURE 1
  LOG_BUILD 1
  LOG_INSTALL 1
  LOG_TEST 1

  # Dependency on the superbuild's RCCL, if one was recorded above;
  # expands to nothing otherwise.
  ${_aws_ofi_rccl_rccl_dep}

  CONFIGURE_COMMAND
    "${LBANN_SB_${PKG_NAME}_SOURCE_DIR}/configure"
    "--prefix=${LBANN_SB_${PKG_NAME}_PREFIX}"
    "CC=${LBANN_SB_${PKG_NAME}_C_COMPILER}"
    ${_aws_ofi_rccl_hip_path_opt}
    ${_aws_ofi_rccl_rccl_path_opt}
    ${_aws_ofi_rccl_libfabric_path_opt}
    ${_aws_ofi_rccl_mpi_path_opt}

  BUILD_COMMAND
    ${GNU_MAKE_PROGRAM} -j${${PKG_NAME}_MAX_MAKE_JOBS}

  INSTALL_COMMAND
    ${GNU_MAKE_PROGRAM} install
)

# Insert an "autogen" step between the download and configure steps.
# It runs autogen.sh from within the source directory; the "configure"
# script is declared as the file this step produces.
ExternalProject_Add_Step(${PKG_NAME} autogen
  COMMENT "Running autogen.sh for ${PKG_NAME}"
  WORKING_DIRECTORY "<SOURCE_DIR>"
  COMMAND "<SOURCE_DIR>/autogen.sh"
  BYPRODUCTS "<SOURCE_DIR>/configure"

  # Ordering relative to the built-in steps.
  DEPENDEES download
  DEPENDERS configure

  INDEPENDENT YES
  ALWAYS NO
  LOG YES
  USES_TERMINAL NO)

# Publish the package's install prefix as an internal cache entry
# named <PKG_NAME>_DIR.
set(${PKG_NAME}_DIR
  "${LBANN_SB_${PKG_NAME}_PREFIX}"
  CACHE INTERNAL
  "The install prefix of ${PKG_NAME}.")
10 changes: 10 additions & 0 deletions scripts/superbuild/examples/rocm-distconv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ BUILD_LBANN_STACK=ON
# LBANN stack.
BUILD_WITH_DISTCONV=ON

# Whether the superbuild should also build the AWS OFI RCCL plugin
# (forwarded to CMake as LBANN_SB_BUILD_AWS_OFI_RCCL).
#
# Set to ON if you're on a Cray machine that doesn't provide the AWS
# plugin as part of its default RCCL installation.
#
# It might also be advisable to build this if you build a custom RCCL.
# The configuration script takes a RCCL path as a parameter, so it
# could matter, but it's not clear how much.
BUILD_AWS_OFI_RCCL_PLUGIN=OFF

# Improve debugging info and remove some misguided warnings. These are
# passed only to the LBANN stack.
EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations"
Expand Down Expand Up @@ -102,6 +110,8 @@ cmake \
-D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \
-D LBANN_SB_OpenCV_TAG=4.x \
\
-D LBANN_SB_BUILD_AWS_OFI_RCCL=${BUILD_AWS_OFI_RCCL_PLUGIN} \
\
-D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \
-D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \
-D LBANN_SB_Aluminum_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \
Expand Down

0 comments on commit 7d1d17c

Please sign in to comment.