Skip to content

Commit

Permalink
Merge branch 'master' into xsn/fix_logprobs
Browse files Browse the repository at this point in the history
  • Loading branch information
ngxson committed Dec 18, 2024
2 parents 630ddcc + 152610e commit c0cca53
Show file tree
Hide file tree
Showing 100 changed files with 14,840 additions and 1,082 deletions.
3 changes: 2 additions & 1 deletion .devops/nix/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
# Increases the runtime closure size by ~700M
useMpi ? false,
useRocm ? config.rocmSupport,
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
enableCurl ? true,
useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
Expand Down Expand Up @@ -188,7 +189,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
]
++ optionals useRocm [
(cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
(cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
(cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
]
++ optionals useMetalKit [
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
Expand Down
10 changes: 5 additions & 5 deletions .devops/tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,23 @@ arg1="$1"
shift

if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert_hf_to_gguf.py "$@"
exec python3 ./convert_hf_to_gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./llama-quantize "$@"
exec ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./llama-cli "$@"
exec ./llama-cli "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
if [ -f "${i/f16/q4_0}" ]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./llama-quantize "$i" "${i/f16/q4_0}" q4_0
exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
fi
done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./llama-server "$@"
exec ./llama-server "$@"
else
echo "Unknown command: $arg1"
echo "Available commands: "
Expand Down
34 changes: 32 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ jobs:
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential vulkan-sdk
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
- name: Build
id: cmake_build
Expand All @@ -327,6 +327,12 @@ jobs:
cmake -DGGML_VULKAN=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.0.2
Expand Down Expand Up @@ -662,6 +668,8 @@ jobs:
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'llvm-arm64-opencl-adreno'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

steps:
- name: Clone
Expand Down Expand Up @@ -703,6 +711,28 @@ jobs:
run: |
choco install ninja
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
run: |
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
mkdir build && cd build
cmake .. `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build . --target install
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build-arm64-release && cd build-arm64-release
cmake .. `
-A arm64 `
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build . --target install --config release
- name: Build
id: cmake_build
run: |
Expand Down Expand Up @@ -732,7 +762,7 @@ jobs:
- name: Test
id: cmake_test
# not all machines have native AVX-512
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
Expand Down
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ BUILD_TARGETS = \
llama-infill \
llama-llava-cli \
llama-minicpmv-cli\
llama-qwen2vl-cli\
llama-lookahead \
llama-lookup \
llama-lookup-create \
Expand Down Expand Up @@ -1404,6 +1405,14 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
(cd examples/batched.swift; make build)
Expand Down
20 changes: 18 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)

#### Multimodal

Expand All @@ -110,6 +111,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)

</details>

Expand Down Expand Up @@ -219,7 +221,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](docs/build.md#hipblas) | AMD GPU |
| [HIP](docs/build.md#hip) | AMD GPU |
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |

Expand Down Expand Up @@ -412,7 +414,7 @@ To learn more about model quantization, [read this documentation](examples/quant
[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)

## [`llama-bench`](example/bench)
## [`llama-bench`](examples/llama-bench)

#### Benchmark the performance of the inference for various parameters.

Expand All @@ -433,6 +435,20 @@ To learn more about model quantization, [read this documentation](examples/quant

</details>

## [`llama-run`](examples/run)

#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].

- <details>
<summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
```bash
llama-run granite-code
```
</details>
[^3]: [https://github.com/containers/ramalama](RamaLama)
## [`llama-simple`](examples/simple)
Expand Down
2 changes: 1 addition & 1 deletion common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url
if (LLAMA_CURL)
find_package(CURL REQUIRED)
add_definitions(-DLLAMA_USE_CURL)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
Expand Down
13 changes: 6 additions & 7 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -855,13 +855,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.ignore_eos = true;
}
).set_sparam());
add_opt(common_arg(
{"--penalize-nl"},
string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
[](common_params & params) {
params.sampling.penalize_nl = true;
}
).set_sparam());
add_opt(common_arg(
{"--temp"}, "N",
string_format("temperature (default: %.1f)", (double)params.sampling.temp),
Expand Down Expand Up @@ -916,6 +909,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--repeat-last-n"}, "N",
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
[](common_params & params, int value) {
if (value < -1) {
throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
}
params.sampling.penalty_last_n = value;
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
}
Expand Down Expand Up @@ -970,6 +966,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--dry-penalty-last-n"}, "N",
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
[](common_params & params, int value) {
if (value < -1) {
throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
}
params.sampling.dry_penalty_last_n = value;
}
).set_sparam());
Expand Down
29 changes: 22 additions & 7 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -940,6 +940,25 @@ struct common_init_result common_init_from_params(common_params & params) {
params.sampling.ignore_eos = false;
}

if (params.sampling.ignore_eos) {
for (llama_token i = 0; i < llama_n_vocab(model); i++) {
if (llama_token_is_eog(model, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias.push_back({i, -INFINITY});
}
}
}

if (params.sampling.penalty_last_n == -1) {
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.penalty_last_n = llama_n_ctx(lctx);
}

if (params.sampling.dry_penalty_last_n == -1) {
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
}

if (params.warmup) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

Expand Down Expand Up @@ -1076,12 +1095,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2


static bool starts_with(const std::string & str, const std::string & prefix) {
// While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}

static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;

Expand Down Expand Up @@ -1767,7 +1780,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
break;
case 0: // max absolute
for (int i = 0; i < n; i++) {
if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
if (sum < std::abs(inp[i])) {
sum = std::abs(inp[i]);
}
}
sum /= 32760.0; // make an int16 range
break;
Expand Down
29 changes: 19 additions & 10 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;

// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET;
extern const char * LLAMA_COMMIT;
extern const char * LLAMA_COMPILER;
extern const char * LLAMA_BUILD_TARGET;

struct common_control_vector_load_info;

Expand Down Expand Up @@ -95,6 +95,7 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
COMMON_SAMPLER_TYPE_XTC = 8,
COMMON_SAMPLER_TYPE_INFILL = 9,
COMMON_SAMPLER_TYPE_PENALTIES = 10,
};

// dimensionality reduction methods, used by cvector-generator
Expand Down Expand Up @@ -130,7 +131,6 @@ struct common_params_sampling {
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics
bool timing_per_token = false;
Expand All @@ -140,6 +140,7 @@ struct common_params_sampling {


std::vector<enum common_sampler_type> samplers = {
COMMON_SAMPLER_TYPE_PENALTIES,
COMMON_SAMPLER_TYPE_DRY,
COMMON_SAMPLER_TYPE_TOP_K,
COMMON_SAMPLER_TYPE_TYPICAL_P,
Expand Down Expand Up @@ -194,11 +195,13 @@ struct common_params {
float defrag_thold = 0.1f; // KV cache defragmentation threshold

// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs

enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
Expand Down Expand Up @@ -438,6 +441,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return parts;
}

static bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}

bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);

Expand Down Expand Up @@ -589,7 +597,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
// Embedding utils
//

void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
// TODO: repace embd_norm with an enum
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

Expand Down
Loading

0 comments on commit c0cca53

Please sign in to comment.