diff --git a/CMakeLists.txt b/CMakeLists.txt index a573099d8..29e39f79c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR}) set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}") cmake_minimum_required(VERSION 3.25) -enable_language(CXX) +project(mscclpp LANGUAGES CXX) list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) @@ -61,7 +61,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") if(MSCCLPP_USE_CUDA) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra") - project(mscclpp LANGUAGES CXX CUDA) + enable_language(CUDA) # CUDA 11 or higher is required if(CUDAToolkit_VERSION_MAJOR LESS 11) @@ -83,7 +83,6 @@ if(MSCCLPP_USE_CUDA) else() set(CMAKE_HIP_STANDARD 17) set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra") - project(mscclpp LANGUAGES CXX) set(CMAKE_HIP_ARCHITECTURES gfx90a gfx941 gfx942) diff --git a/apps/nccl/src/broadcast.hpp b/apps/nccl/src/broadcast.hpp index 76899f93d..a453bcb2c 100644 --- a/apps/nccl/src/broadcast.hpp +++ b/apps/nccl/src/broadcast.hpp @@ -17,12 +17,7 @@ __global__ void __launch_bounds__(1024, 1) broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff, mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t root, size_t nRanksPerNode, size_t nelemsPerGPU) { - const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; - const size_t lid = tid % WARP_SIZE; - const size_t wid = tid / WARP_SIZE; - const size_t nThread = blockDim.x * gridDim.x; - const size_t nWarp = nThread / WARP_SIZE; const size_t nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 5f2070323..3e5b0682e 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -118,43 +118,6 @@ static size_t ncclTypeSize(ncclDataType_t type) { return 0; } -static double parseSize(const char* value) { - std::string valueStr(value); - std::istringstream iss(valueStr); - long long int units; - double size; - char size_lit = 0; - - if (iss >> size) { - iss >> std::ws; // eat whitespace - iss >> size_lit; - } else { - return -1.0; - } - - if (size_lit != 0 && !std::isspace(size_lit)) { - switch (size_lit) { - case 'G': - case 'g': - units = 1024 * 1024 * 1024; - break; - case 'M': - case 'm': - units = 1024 * 1024; - break; - case 'K': - case 'k': - units = 1024; - break; - default: - return -1.0; - }; - } else { - units = 1; - } - return size * units; -} - static mscclpp::Transport getTransport(int, int) { // if (rank / nRanksPerNode == peerRank / nRanksPerNode) { // return mscclpp::Transport::CudaIpc; diff --git a/cmake/AddFormatTargets.cmake b/cmake/AddFormatTargets.cmake index 9829bd135..8b9b0e4ef 100644 --- a/cmake/AddFormatTargets.cmake +++ b/cmake/AddFormatTargets.cmake @@ -9,7 +9,7 @@ add_custom_target(format) find_program(CLANG_FORMAT clang-format) if(CLANG_FORMAT) message(STATUS "Found clang-format: ${CLANG_FORMAT}") - set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test ${PROJECT_SOURCE_DIR}/apps) + set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test ${PROJECT_SOURCE_DIR}/apps/nccl/src) add_custom_target(check-format-cpp ALL COMMAND ${CLANG_FORMAT} -style=file --dry-run `find ${FIND_DIRS} -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu` ) diff --git a/include/mscclpp/proxy_channel_device.hpp b/include/mscclpp/proxy_channel_device.hpp index b93b215eb..38237978a 100644 --- a/include/mscclpp/proxy_channel_device.hpp +++ b/include/mscclpp/proxy_channel_device.hpp @@ -92,10 +92,11 @@ struct BaseProxyChannelDeviceHandle { // can produce for and the sole proxy thread consumes it. FifoDeviceHandle fifo_; - BaseProxyChannelDeviceHandle() {} + MSCCLPP_HOST_DEVICE_INLINE BaseProxyChannelDeviceHandle() {} - BaseProxyChannelDeviceHandle(SemaphoreId semaphoreId, Host2DeviceSemaphoreDeviceHandle semaphore, - FifoDeviceHandle fifo) + MSCCLPP_HOST_DEVICE_INLINE BaseProxyChannelDeviceHandle(SemaphoreId semaphoreId, + Host2DeviceSemaphoreDeviceHandle semaphore, + FifoDeviceHandle fifo) : semaphoreId_(semaphoreId), semaphore_(semaphore), fifo_(fifo) {} #if defined(MSCCLPP_DEVICE_COMPILE) @@ -185,10 +186,11 @@ struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle { MemoryId dst_; MemoryId src_; - ProxyChannelDeviceHandle(){}; + MSCCLPP_HOST_DEVICE_INLINE ProxyChannelDeviceHandle(){}; - ProxyChannelDeviceHandle(SemaphoreId semaphoreId, Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo, - MemoryId dst, MemoryId src) + MSCCLPP_HOST_DEVICE_INLINE ProxyChannelDeviceHandle(SemaphoreId semaphoreId, + Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo, + MemoryId dst, MemoryId src) : BaseProxyChannelDeviceHandle(semaphoreId, semaphore, fifo), dst_(dst), src_(src) {} #if defined(MSCCLPP_DEVICE_COMPILE)