TensorRT-LLM v0.13 Update (#2269)
Shixiaowei02 authored Sep 30, 2024
1 parent 28fb9aa commit 201135e
Showing 592 changed files with 773,759 additions and 87,300 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -11,3 +11,6 @@
[submodule "3rdparty/NVTX"]
path = 3rdparty/NVTX
url = https://github.com/NVIDIA/NVTX.git
[submodule "3rdparty/ucxx"]
path = 3rdparty/ucxx
url = https://github.com/GuanLuo/ucxx.git
1 change: 1 addition & 0 deletions 3rdparty/ucxx
Submodule ucxx added at b99181
10 changes: 5 additions & 5 deletions README.md
@@ -7,8 +7,8 @@ TensorRT-LLM
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.3.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.12.0-green)](./tensorrt_llm/version.py)
[![trt](https://img.shields.io/badge/TRT-10.4.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.13.0-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

[Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)
@@ -17,11 +17,11 @@ TensorRT-LLM
<div align="left">

## Latest News
* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)

* [2024/08/13] 🐍 DIY Code Completion with #Mamba ⚡ #TensorRT #LLM for speed 🤖 NIM for ease ☁️ deploy anywhere
[➡️ link](https://developer.nvidia.com/blog/revolutionizing-code-completion-with-codestral-mamba-the-next-gen-coding-llm/)
<div align="center">
<img src="docs/source/media/picture-08-13-2024.png" width="50%">
<div align="left">

* [2024/08/06] 🗫 Multilingual Challenge Accepted 🗫
🤖 #TensorRT #LLM boosts low-resource languages like Hebrew, Indonesian and Vietnamese ⚡[➡️ link](https://developer.nvidia.com/blog/accelerating-hebrew-llm-performance-with-nvidia-tensorrt-llm/?linkId=100000278659647)
4 changes: 2 additions & 2 deletions benchmarks/cpp/README.md
@@ -267,10 +267,10 @@ for nloras in ${NUM_LORAS[@]}; do
--input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
done

# Generate random lora weights for 256 adapters
# Generate random lora weights for 16 adapters
python benchmarks/cpp/utils/generate_rand_loras.py ${CPP_LORA} ${EG_DIR}/loras 16

# perform benchmarking
# Perform benchmarking

# First run inference without LoRAs
mkdir -p ${EG_DIR}/log-base-lora
142 changes: 106 additions & 36 deletions benchmarks/cpp/gptManagerBenchmark.cpp

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions benchmarks/cpp/gptSessionBenchmark.cpp
@@ -427,7 +427,8 @@ int main(int argc, char* argv[])

options.add_options()("ctx_micro_batch_size", "Batch size for context phase.", cxxopts::value<int>());
options.add_options()("gen_micro_batch_size", "Batch size for generation phase.", cxxopts::value<int>());
options.add_options()("max_attention_window", "Max kv cache length per sequence.", cxxopts::value<int>());
options.add_options()(
"max_attention_window", "Max kv cache length per sequence.", cxxopts::value<std::vector<int>>());
options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value<int>());
options.add_options()("sink_token_len", "Sink token length in kv cache per sequence.", cxxopts::value<int>());
options.add_options()(
@@ -535,7 +536,7 @@ int main(int argc, char* argv[])
// Argument: Max KV Cache Length
if (result.count("max_attention_window"))
{
sessionConfig.kvCacheConfig.maxAttentionWindow = result["max_attention_window"].as<int>();
sessionConfig.kvCacheConfig.maxAttentionWindowVec = result["max_attention_window"].as<std::vector<int>>();
}
// Argument: Sink token length
if (result.count("sink_token_len"))
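Note: `--max_attention_window` now takes a list of window sizes instead of a single value, and the benchmark forwards it to `kvCacheConfig.maxAttentionWindowVec`. Below is a minimal standalone sketch (not part of the commit) of how such a vector-valued cxxopts option is parsed, assuming cxxopts' default comma delimiter for `std::vector` values:

```cpp
#include <cxxopts.hpp>
#include <iostream>
#include <vector>

int main(int argc, char* argv[])
{
    cxxopts::Options options("session_benchmark_demo", "Demo of a vector-valued option");
    options.add_options()(
        "max_attention_window", "Max kv cache length per sequence.", cxxopts::value<std::vector<int>>());

    auto result = options.parse(argc, argv);
    if (result.count("max_attention_window"))
    {
        // e.g. --max_attention_window 2048,2048,4096, or the flag repeated once per value
        auto const windows = result["max_attention_window"].as<std::vector<int>>();
        // The benchmark assigns this directly: sessionConfig.kvCacheConfig.maxAttentionWindowVec = windows;
        for (auto const w : windows)
        {
            std::cout << "attention window: " << w << '\n';
        }
    }
    return 0;
}
```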
25 changes: 12 additions & 13 deletions benchmarks/python/all_reduce.py
@@ -23,7 +23,6 @@

import tensorrt_llm as tllm
from tensorrt_llm import Mapping, Tensor
from tensorrt_llm._ipc_utils import peer_access
from tensorrt_llm._utils import OMPI_COMM_TYPE_HOST, mpi_comm
from tensorrt_llm.functional import AllReduceStrategy, allreduce
from tensorrt_llm.plugin.plugin import current_all_reduce_helper
@@ -106,18 +105,18 @@ def allreduce_benchmark(dtype: str,
_, start = cuda.cuEventCreate(0)
_, stop = cuda.cuEventCreate(0)
runtimes = []
with peer_access(mapping):
tllm.mpi_barrier()

for _ in range(10):
cuda.cuEventRecord(start, stream.cuda_stream)
session.run(inputs=feed_dict,
outputs={"output": output},
stream=stream.cuda_stream)
cuda.cuEventRecord(stop, stream.cuda_stream)
torch.cuda.synchronize()
_, ms = cuda.cuEventElapsedTime(start, stop)
runtimes.append(ms)

tllm.mpi_barrier()

for _ in range(10):
cuda.cuEventRecord(start, stream.cuda_stream)
session.run(inputs=feed_dict,
outputs={"output": output},
stream=stream.cuda_stream)
cuda.cuEventRecord(stop, stream.cuda_stream)
torch.cuda.synchronize()
_, ms = cuda.cuEventElapsedTime(start, stop)
runtimes.append(ms)

median_ms = sorted(runtimes)[len(runtimes) // 2]
assert torch.allclose(output, (input * world_size)**inner_loop)
12 changes: 5 additions & 7 deletions benchmarks/python/check_accuracy_mlperf.py
@@ -9,6 +9,7 @@
from transformers import AutoTokenizer, LlamaTokenizerFast

nltk.download("punkt", quiet=False)
nltk.download('punkt_tab')
import argparse


@@ -25,10 +26,9 @@ class Model(Enum):
"tokens_per_sample": 294.45 * 0.9
},
Model.GPT_J: {
"rouge1": 42.9435135,
"rouge2": 20.1033765,
"rougeL": 29.9581119,
# "tokens_per_sample": ??
"rouge1": 42.9865 * 0.99,
"rouge2": 20.1235 * 0.99,
"rougeL": 29.9881 * 0.99,
}
}

@@ -138,7 +138,6 @@ def main():
target_texts = get_reference_df(args.dataset)
model = Model.Llama_v2_70B
tokenizer = LlamaTokenizerFast.from_pretrained(args.base_model)
relaxing_factor = 1.0
elif args.dataset.lower().endswith(".json"):
target_texts = get_reference_json(args.dataset)
model = Model.GPT_J
@@ -147,7 +146,6 @@
padding_side="left",
use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
relaxing_factor = 0.93
else:
raise RuntimeError(
"Dataset expected to be pkl (open-orca) or json (cnn-dailymail)")
@@ -169,7 +167,7 @@
print("Targets: ", targets)

for k, _ in targets.items():
assert targets[k] * relaxing_factor <= achieved_scores[k]
assert targets[k] <= achieved_scores[k]


if __name__ == "__main__":
6 changes: 5 additions & 1 deletion benchmarks/python/enc_dec_benchmark.py
@@ -25,6 +25,7 @@
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime.session import TensorInfo
from tensorrt_llm.runtime import ModelConfig
from tensorrt_llm.models.modeling_utils import get_kv_cache_type_from_legacy


class EncDecBenchmark(BaseBenchmark):
@@ -100,6 +101,9 @@ def read_config(component):
dtype = pretrained_config["dtype"]

paged_kv_cache = plugin_config['paged_kv_cache']
kv_cache_type = get_kv_cache_type_from_legacy(
True, paged_kv_cache)

tokens_per_block = plugin_config['tokens_per_block']

gather_context_logits = builder_config.get(
@@ -120,7 +124,7 @@
num_layers=num_layers,
gpt_attention_plugin=use_gpt_attention_plugin,
remove_input_padding=remove_input_padding,
paged_kv_cache=paged_kv_cache,
kv_cache_type=kv_cache_type,
tokens_per_block=tokens_per_block,
cross_attention=cross_attention,
has_position_embedding=has_position_embedding,
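Note: `get_kv_cache_type_from_legacy` converts the old boolean flags into the `kv_cache_type` value that `ModelConfig` now consumes (compare the analogous branch added to `gpt_benchmark.py` below). A self-contained sketch of the assumed semantics follows; the enum is defined locally for illustration, so nothing is claimed about the library's exact names:

```cpp
#include <iostream>

// Illustrative enum; the real one is TensorRT-LLM's KVCacheType binding.
enum class KVCacheType { kDISABLED, kCONTINUOUS, kPAGED };

// Assumed mapping: kv cache off -> DISABLED, otherwise PAGED vs CONTINUOUS per the legacy flag.
KVCacheType kvCacheTypeFromLegacy(bool useKvCache, bool pagedKvCache)
{
    if (!useKvCache)
    {
        return KVCacheType::kDISABLED;
    }
    return pagedKvCache ? KVCacheType::kPAGED : KVCacheType::kCONTINUOUS;
}

int main()
{
    std::cout << static_cast<int>(kvCacheTypeFromLegacy(/*useKvCache=*/true, /*pagedKvCache=*/true)) << '\n';
    return 0;
}
```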
11 changes: 9 additions & 2 deletions benchmarks/python/gpt_benchmark.py
@@ -20,6 +20,7 @@
import torch

import tensorrt_llm
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.builder import Engine
from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession,
SamplingConfig)
@@ -77,6 +78,13 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
if hasattr(self, item):
rnn_configs_kwargs[item] = getattr(self, item)

kv_cache_type = KVCacheType.CONTINUOUS
if hasattr(self, 'kv_cache_type'):
kv_cache_type = self.kv_cache_type
else:
if hasattr(self, 'paged_kv_cache'):
kv_cache_type = KVCacheType.PAGED if self.paged_kv_cache == True else KVCacheType.CONTINUOUS

model_config = tensorrt_llm.runtime.ModelConfig(
max_batch_size=self.max_batch_size,
max_beam_width=self.num_beams,
@@ -86,8 +94,7 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
num_kv_heads=ceil(self.num_kv_heads / self.world_size),
hidden_size=self.hidden_size // self.world_size,
gpt_attention_plugin=self.use_gpt_attention_plugin,
paged_kv_cache=self.paged_kv_cache if hasattr(
self, 'paged_kv_cache') else False,
kv_cache_type=kv_cache_type,
paged_state=self.paged_state
if hasattr(self, 'paged_state') else False,
dtype=self.dtype,
45 changes: 43 additions & 2 deletions cpp/CMakeLists.txt
@@ -96,6 +96,23 @@ else()
message(STATUS "Importing nvrtc wrapper")
endif()

if(EXISTS
"${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/kernels/internal_cutlass_kernels/CMakeLists.txt"
)
set(BUILD_INTERNAL_CUTLASS_KERNELS_DEFAULT ON)
else()
set(BUILD_INTERNAL_CUTLASS_KERNELS_DEFAULT OFF)
endif()
option(BUILD_INTERNAL_CUTLASS_KERNELS
"Build internal cutlass kernels from source"
${BUILD_INTERNAL_CUTLASS_KERNELS_DEFAULT})

if(BUILD_INTERNAL_CUTLASS_KERNELS)
message(STATUS "Building internal cutlass kernels")
else()
message(STATUS "Importing internal cutlass kernels")
endif()

if(BUILD_PYT)
message(STATUS "Building PyTorch")
else()
@@ -289,7 +306,7 @@ set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
find_library(RT_LIB rt)

set_ifndef(ENABLE_MULTI_DEVICE 1)
if(ENABLE_MULTI_DEVICE EQUAL 1)
if(ENABLE_MULTI_DEVICE)
# NCCL dependencies
set_ifndef(NCCL_LIB_DIR /usr/lib/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu/)
set_ifndef(NCCL_INCLUDE_DIR /usr/include/)
@@ -364,7 +381,7 @@ endif()
# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G")

set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE}"
"${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE} -DENABLE_UCX=${ENABLE_UCX}"
)

# Fix linking issue with TRT 10, the detailed description about `--mcmodel` can
@@ -521,6 +538,30 @@ elseif(NOT WIN32)
message(STATUS "Build without PyTorch, USE_CXX11_ABI=${USE_CXX11_ABI}")
endif()

# Defer UCX/UCXX setup until after USE_CXX11_ABI is well defined, as UCXX will
# need to be built to have aligned symbols
set_ifndef(ENABLE_UCX 0)
if(ENABLE_UCX)
# Only enable UCX related features if the system has UCX library
find_package(ucx)
if(NOT ${ucx_FOUND})
set(ENABLE_UCX 0)
else()
# installing ucxx via add_subdirectory results in strange cudart linking
# error, thus using their installation script to isolate the installation
# process until the issue is understood. And always trigger the build so
# that change in USE_CXX11_ABI will not be ignored.
execute_process(
COMMAND
${3RDPARTY_DIR}/ucxx/build.sh libucxx -n
--cmake-args=\"-DBUILD_SHARED_LIBS=OFF
-DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=${USE_CXX11_ABI}\"
COMMAND_ECHO STDOUT)
find_package(ucxx REQUIRED PATHS ${3RDPARTY_DIR}/ucxx/cpp/build
NO_DEFAULT_PATH)
endif()
endif()

file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS
REGEX "#define NV_TENSORRT_.*")
foreach(TYPE MAJOR MINOR PATCH BUILD)
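Note: `ENABLE_UCX` is passed to the compiler as `-DENABLE_UCX=${ENABLE_UCX}`, i.e. as a 0/1 value like `ENABLE_MULTI_DEVICE`, so C++ code would typically branch on it with `#if` rather than `#ifdef`. A minimal sketch of that pattern; the guarded bodies are placeholders, not the library's actual code:

```cpp
#include <iostream>

#ifndef ENABLE_UCX
#define ENABLE_UCX 0 // the build system normally supplies this as 0 or 1
#endif

int main()
{
#if ENABLE_UCX
    std::cout << "Built with UCX support enabled\n"; // e.g. register UCX-based transfer paths here
#else
    std::cout << "Built without UCX support\n";
#endif
    return 0;
}
```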
23 changes: 23 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/inferenceRequest.h
@@ -18,6 +18,7 @@

#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/namedTensor.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/iTensor.h"

#include <algorithm>
@@ -35,10 +36,12 @@
{
// Input tensors
auto constexpr kInputIdsTensorName = "input_ids";
auto constexpr kPositionIdsTensorName = "position_ids";
auto constexpr kDraftInputIdsTensorName = "draft_input_ids";
auto constexpr kDraftLogitsTensorName = "draft_logits";
auto constexpr kMaxNewTokensTensorName = "request_output_len";
auto constexpr kBeamWidthTensorName = "beam_width";
auto constexpr kNumReturnSequencesTensorName = "num_return_sequences";
auto constexpr kEndIdTensorName = "end_id";
auto constexpr kPadIdTensorName = "pad_id";
auto constexpr kBadWordsListTensorName = "bad_words_list";
@@ -165,17 +168,34 @@ class GenericInferenceRequest
mLogitsPostProcessor = cb;
}

[[nodiscard]] std::optional<executor::LookaheadDecodingConfig> getLookaheadConfig() const
{
return mLookaheadConfig;
}

void setLookaheadConfig(executor::LookaheadDecodingConfig config)
{
mLookaheadConfig = config;
}

void clearLookaheadConfig()
{
mLookaheadConfig = std::nullopt;
}

std::optional<LogitsPostProcessor> getLogitsPostProcessor()
{
return mLogitsPostProcessor;
}

static std::array constexpr kTensorNames = {
inference_request::kInputIdsTensorName,
inference_request::kPositionIdsTensorName,
inference_request::kDraftInputIdsTensorName,
inference_request::kDraftLogitsTensorName,
inference_request::kMaxNewTokensTensorName,
inference_request::kBeamWidthTensorName,
inference_request::kNumReturnSequencesTensorName,
inference_request::kEndIdTensorName,
inference_request::kPadIdTensorName,
inference_request::kBadWordsListTensorName,
@@ -240,10 +260,12 @@
}

TENSOR_GETTER_SETTER(InputIds, inference_request::kInputIdsTensorName)
TENSOR_GETTER_SETTER(PositionIds, inference_request::kPositionIdsTensorName)
TENSOR_GETTER_SETTER(DraftInputIds, inference_request::kDraftInputIdsTensorName)
TENSOR_GETTER_SETTER(DraftLogits, inference_request::kDraftLogitsTensorName)
TENSOR_GETTER_SETTER(MaxNewTokens, inference_request::kMaxNewTokensTensorName)
TENSOR_GETTER_SETTER(BeamWidth, inference_request::kBeamWidthTensorName)
TENSOR_GETTER_SETTER(NumReturnSequences, inference_request::kNumReturnSequencesTensorName)
TENSOR_GETTER_SETTER(EndId, inference_request::kEndIdTensorName)
TENSOR_GETTER_SETTER(PadId, inference_request::kPadIdTensorName)
TENSOR_GETTER_SETTER(BadWordsList, inference_request::kBadWordsListTensorName)
@@ -282,6 +304,7 @@ class GenericInferenceRequest
bool mIsStreaming;
TensorMap mInputTensors;
std::optional<LogitsPostProcessor> mLogitsPostProcessor;
std::optional<executor::LookaheadDecodingConfig> mLookaheadConfig;
};

class InferenceRequest : public GenericInferenceRequest<tensorrt_llm::runtime::ITensor::SharedPtr, NamedTensor>
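Note: the new `setLookaheadConfig` / `getLookaheadConfig` / `clearLookaheadConfig` accessors let a per-request lookahead speculative-decoding configuration travel with the `InferenceRequest`. A minimal sketch of a hypothetical helper that uses only the accessors shown above (namespaces assumed from the header's location under `tensorrt_llm/batch_manager`):

```cpp
#include "tensorrt_llm/batch_manager/inferenceRequest.h"
#include "tensorrt_llm/executor/executor.h"

#include <optional>

namespace tb = tensorrt_llm::batch_manager;
namespace tle = tensorrt_llm::executor;

// Apply or clear a per-request lookahead decoding configuration.
void applyLookaheadConfig(tb::InferenceRequest& request, std::optional<tle::LookaheadDecodingConfig> const& config)
{
    if (config)
    {
        request.setLookaheadConfig(*config); // enable lookahead decoding for this request
    }
    else if (request.getLookaheadConfig())
    {
        request.clearLookaheadConfig(); // fall back to regular decoding
    }
}
```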