From c5ac2b85bc137b3d7b316e8ef8677dfe99d1b6ba Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 31 Dec 2024 12:14:15 +0100
Subject: [PATCH] server : clean up built-in template detection

---
 common/common.cpp          | 13 +++++++++++++
 common/common.h            |  3 +++
 examples/server/server.cpp | 23 +++++++++--------------
 examples/server/utils.hpp  | 13 -------------
 4 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 9071999a7a8e3..ba596923dc465 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1614,6 +1614,19 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //
 
+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    static const char * template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+    if (res < 2) {
+        return "";
+    } else {
+        std::vector<char> model_template(res + 1, 0);
+        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size() - 1);
+    }
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
diff --git a/common/common.h b/common/common.h
index 1d2bd932c211d..589f65d098888 100644
--- a/common/common.h
+++ b/common/common.h
@@ -571,6 +571,9 @@ struct common_chat_msg {
     std::string content;
 };
 
+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3558ddb7c711f..904904fcac49b 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1623,17 +1623,10 @@ struct server_context {
         return true;
     }
 
-    bool validate_model_chat_template() const {
-        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
-        std::string template_key = "tokenizer.chat_template";
-        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        if (res >= 0) {
-            llama_chat_message chat[] = {{"user", "test"}};
-            std::string tmpl = std::string(model_template.data(), model_template.size());
-            int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
-            return chat_res > 0;
-        }
-        return false;
+    bool validate_builtin_chat_template() const {
+        llama_chat_message chat[] = {{"user", "test"}};
+        int32_t chat_res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
+        return chat_res > 0;
     }
 
     void init() {
@@ -3484,7 +3477,7 @@ int main(int argc, char ** argv) {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots",                 ctx_server.params_base.n_parallel },
             { "model_path",                  ctx_server.params_base.model },
-            { "chat_template",               llama_get_chat_template(ctx_server.model) },
+            { "chat_template",               common_get_builtin_chat_template(ctx_server.model) },
             { "build_info",                  build_info },
         };
 
@@ -4113,14 +4106,16 @@ int main(int argc, char ** argv) {
 
     // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
     if (params.chat_template.empty()) {
-        if (!ctx_server.validate_model_chat_template()) {
+        if (!ctx_server.validate_builtin_chat_template()) {
             LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
             params.chat_template = "chatml";
         }
     }
 
     // print sample chat example to make it clear which template is used
-    LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str());
+    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
+        params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
+        common_chat_format_example(ctx_server.model, params.chat_template).c_str());
 
     ctx_server.queue_tasks.on_new_task(std::bind(
         &server_context::process_single_task, &ctx_server, std::placeholders::_1));
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 334f2f19207ef..4727699cf68a5 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -382,19 +382,6 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
     return formatted_chat;
 }
 
-static std::string llama_get_chat_template(const struct llama_model * model) {
-    std::string template_key = "tokenizer.chat_template";
-    // call with NULL buffer to get the total size of the string
-    int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
-    if (res < 2) {
-        return "";
-    } else {
-        std::vector<char> model_template(res + 1, 0);
-        llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        return std::string(model_template.data(), model_template.size() - 1);
-    }
-}
-
 //
 // base64 utils (TODO: move to common in the future)
 //
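
Reviewer note, not part of the patch: a minimal sketch of how the new helper and the
validation path fit together on the caller side. It assumes the repo's common.h and
llama.h headers and an already-loaded model; the pick_chat_template() helper name is
hypothetical and only mirrors the fallback logic that main() applies above.

    // Sketch only: resolve the chat template the server will effectively use.
    #include <string>
    #include "common.h"
    #include "llama.h"

    static std::string pick_chat_template(const struct llama_model * model, const std::string & requested) {
        if (!requested.empty()) {
            return requested; // an explicit --chat-template wins over the built-in one
        }
        // Probe the built-in template the same way validate_builtin_chat_template() does:
        // passing nullptr as the template makes llama_chat_apply_template() use the model's own.
        llama_chat_message chat[] = {{"user", "test"}};
        if (llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0) <= 0) {
            return "chatml"; // missing or unsupported built-in template -> fall back
        }
        return common_get_builtin_chat_template(model); // raw template string from GGUF metadata
    }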