llmodel: dlopen llama.cpp libraries lazily instead of eagerly #3308

Draft: wants to merge 1 commit into base: main
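The change in a nutshell: each llamamodel-mainline-* library is recorded up front as a backend name plus a file path, and the actual dlopen is deferred until that backend is first requested, instead of loading every library eagerly at startup. A minimal sketch of the pattern (simplified; `LibraryHandle` and the file names below are illustrative stand-ins for the Dlhandle wrapper and paths used in the diff):

```cpp
#include <filesystem>
#include <optional>
#include <string>
#include <vector>

namespace fs = std::filesystem;

// Illustrative stand-in for the real Dlhandle wrapper around dlopen()/LoadLibrary().
struct LibraryHandle {
    explicit LibraryHandle(const fs::path &p) { /* dlopen(p.c_str(), ...) would go here */ }
};

// Lazy entry: remember which backend a library file provides, open it on first use.
struct LazyBackend {
    std::string buildBackend;                 // e.g. "cpu", "metal", "cuda"
    fs::path path;                            // library file that provides it
    std::optional<LibraryHandle> handle = {}; // empty until first requested

    const LibraryHandle &get() {
        if (!handle)             // first call pays the dlopen cost; later calls reuse the handle
            handle.emplace(path);
        return *handle;
    }
};

int main() {
    // Illustrative file names; nothing is opened while this list is built.
    std::vector<LazyBackend> backends {
        { "cpu",  "llamamodel-mainline-cpu.so"  },
        { "cuda", "llamamodel-mainline-cuda.so" },
    };
    backends[0].get(); // only the backend that is actually requested gets opened
}
```

The std::optional doubles as the "loaded yet?" flag, which is essentially what the new LazyImplementation struct in this PR does with its impl member.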
36 changes: 25 additions & 11 deletions gpt4all-backend/include/gpt4all-backend/llmodel.h
@@ -6,6 +6,7 @@
#include <cstddef>
#include <cstdint>
#include <expected>
#include <filesystem>
#include <functional>
#include <optional>
#include <span>
@@ -19,6 +20,7 @@
class Dlhandle;

using namespace std::string_literals;
namespace fs = std::filesystem;

#define LLMODEL_MAX_PROMPT_BATCH 128

@@ -94,12 +96,13 @@ class LLModel {

class Implementation {
public:
Implementation(std::string buildBackend, Dlhandle &&dlhandle);
Implementation(const Implementation &) = delete;
Implementation(Implementation &&);
~Implementation();

std::string_view modelType() const { return m_modelType; }
std::string_view buildVariant() const { return m_buildVariant; }
const std::string &buildBackend() const { return m_buildBackend; }
std::string_view modelType () const { return m_modelType; }

static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
@@ -114,19 +117,17 @@ class LLModel {
static int cpuSupportsAVX2();

private:
Implementation(Dlhandle &&);

static const std::vector<Implementation> &implementationList();
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
static const Implementation *findImplementation(const char *fname, const std::string &buildBackend);
static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);

char *(*m_getFileArch)(const char *fname);
bool (*m_isArchSupported)(const char *arch);
LLModel *(*m_construct)();
std::string m_buildBackend;
Dlhandle *m_dlhandle;

char *(*m_getFileArch) (const char *fname);
bool (*m_isArchSupported)(const char *arch);
LLModel *(*m_construct) ();

std::string_view m_modelType;
std::string_view m_buildVariant;
Dlhandle *m_dlhandle;
};

struct PromptContext {
@@ -141,6 +142,16 @@ class LLModel {
float contextErase = 0.5f; // percent of context to erase if we exceed the context window
};

private:
struct LazyImplementation {
std::string buildBackend;
fs::path path;
std::optional<Implementation> impl = {};

const Implementation &get();
};

public:
explicit LLModel() {}
virtual ~LLModel() {}

Expand Down Expand Up @@ -267,6 +278,9 @@ class LLModel {
const PromptContext &promptCtx,
int32_t nPast);

private:
static std::vector<LazyImplementation> &getImplementations();

friend class LLMImplementation;
};

9 changes: 0 additions & 9 deletions gpt4all-backend/src/llamamodel.cpp
@@ -1278,21 +1278,12 @@ void LLamaModel::embedInternal(
#endif

extern "C" {
DLL_EXPORT bool is_g4a_backend_model_implementation()
{
return true;
}

DLL_EXPORT const char *get_model_type()
{
return modelType_;
}

DLL_EXPORT const char *get_build_variant()
{
return GGML_BUILD_VARIANT;
}

DLL_EXPORT char *get_file_arch(const char *fname)
{
char *arch = nullptr;
130 changes: 56 additions & 74 deletions gpt4all-backend/src/llmodel.cpp
@@ -10,9 +10,9 @@
#include <iterator>
#include <memory>
#include <optional>
#include <regex>
#include <sstream>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

@@ -32,6 +32,8 @@
# include "sysinfo.h" // for getSystemTotalRAMInBytes
#endif

using namespace std::string_literals;
using namespace std::string_view_literals;
namespace fs = std::filesystem;

#ifndef __APPLE__
@@ -66,29 +68,30 @@ std::string s_implementations_search_path = ".";
#define cpu_supports_avx2() !!__builtin_cpu_supports("avx2")
#endif

LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
: m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
LLModel::Implementation::Implementation(std::string buildBackend, Dlhandle &&dlhandle)
: m_buildBackend(std::move(buildBackend))
, m_dlhandle(new Dlhandle(std::move(dlhandle)))
{
auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
assert(get_model_type);
m_modelType = get_model_type();
auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
assert(get_build_variant);
m_buildVariant = get_build_variant();
m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
assert(m_getFileArch);
m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
assert(m_isArchSupported);
m_construct = m_dlhandle->get<LLModel *()>("construct");
assert(m_construct);

m_modelType = get_model_type();
}

LLModel::Implementation::Implementation(Implementation &&o)
: m_getFileArch(o.m_getFileArch)
: m_buildBackend(o.m_buildBackend)
, m_dlhandle(o.m_dlhandle)
, m_getFileArch(o.m_getFileArch)
, m_isArchSupported(o.m_isArchSupported)
, m_construct(o.m_construct)
, m_modelType(o.m_modelType)
, m_buildVariant(o.m_buildVariant)
, m_dlhandle(o.m_dlhandle) {
{
o.m_dlhandle = nullptr;
}

@@ -97,11 +100,6 @@ LLModel::Implementation::~Implementation()
delete m_dlhandle;
}

static bool isImplementation(const Dlhandle &dl)
{
return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
}

// Add the CUDA Toolkit to the DLL search path on Windows.
// This is necessary for chat.exe to find CUDA when started from Qt Creator.
static void addCudaSearchPath()
@@ -117,78 +115,60 @@ static void addCudaSearchPath()
#endif
}

const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList()
auto LLModel::LazyImplementation::get() -> const Implementation &
{
if (!impl) impl.emplace(buildBackend, Dlhandle(path));
return *impl;
}

auto LLModel::getImplementations() -> std::vector<LazyImplementation> &
{
// in no particular order
static const std::array ALL_BUILD_BACKENDS { "cpu"sv, "metal"sv, "kompute"sv, "vulkan"sv, "cuda"sv };
static const std::string_view LIB_EXT(LIB_FILE_EXT);

if (cpu_supports_avx() == 0) {
throw std::runtime_error("CPU does not support AVX");
}

// NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
// individual models without the cleanup of the static list interfering
static auto* libs = new std::vector<Implementation>([] () {
std::vector<Implementation> fres;
static auto* libs = new std::vector<LazyImplementation>([] () {
std::vector<LazyImplementation> fres;

addCudaSearchPath();

std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)";
if (cpu_supports_avx2() == 0) {
impl_name_re += "-avxonly";
}
std::regex re(impl_name_re);
auto search_in_directory = [&](const std::string& paths) {
std::stringstream ss(paths);
std::string path;
// Split the paths string by the delimiter and process each path.
while (std::getline(ss, path, ';')) {
std::u8string u8_path(path.begin(), path.end());
// Iterate over all libraries
for (const auto &f : fs::directory_iterator(u8_path)) {
const fs::path &p = f.path();

if (p.extension() != LIB_FILE_EXT) continue;
if (!std::regex_search(p.stem().string(), re)) continue;

// Add to list if model implementation
Dlhandle dl;
try {
dl = Dlhandle(p);
} catch (const Dlhandle::Exception &e) {
std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
continue;
}
if (!isImplementation(dl)) {
std::cerr << "Not an implementation: " << p.filename().string() << "\n";
continue;
}
fres.emplace_back(Implementation(std::move(dl)));
}
bool avxonly = cpu_supports_avx2() == 0;
std::stringstream ss(s_implementations_search_path);
std::string piece;
// Split the paths string by the delimiter and process each path.
while (std::getline(ss, piece, ';')) {
auto basePath = fs::path(std::u8string(piece.begin(), piece.end()));
// Iterate over all libraries
for (auto &buildBackend : ALL_BUILD_BACKENDS) {
auto path = basePath /
"llamamodel-mainline-"s.append(buildBackend).append(avxonly ? "-avxonly" : "").append(LIB_EXT);
if (fs::exists(path))
fres.push_back(LazyImplementation { std::string(buildBackend), path });
}
};

search_in_directory(s_implementations_search_path);
}

return fres;
}());
// Return static result
return *libs;
}

static std::string applyCPUVariant(const std::string &buildVariant)
auto LLModel::Implementation::findImplementation(const char *fname, const std::string &buildBackend)
-> const Implementation *
{
if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
return buildVariant + "-avxonly";
}
return buildVariant;
}

const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant)
{
bool buildVariantMatched = false;
bool buildBackendMatched = false;
std::optional<std::string> archName;
for (const auto& i : implementationList()) {
if (buildVariant != i.m_buildVariant) continue;
buildVariantMatched = true;
for (auto &li : getImplementations()) {
if (li.buildBackend != buildBackend) continue;
buildBackendMatched = true;

auto &i = li.get();
char *arch = i.m_getFileArch(fname);
if (!arch) continue;
archName = arch;
@@ -198,7 +178,7 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
if (archSupported) return &i;
}

if (!buildVariantMatched)
if (!buildBackendMatched)
return nullptr;
if (!archName)
throw UnsupportedModelError("Unsupported file format");
@@ -216,7 +196,7 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, const
}

for (const auto &desiredBackend: desiredBackends) {
const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
const auto *impl = findImplementation(modelPath.c_str(), desiredBackend);

if (impl) {
// Construct llmodel implementation
@@ -251,11 +231,11 @@ LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::
{
static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;

const std::vector<Implementation> *impls;
std::vector<LazyImplementation> *impls;
try {
impls = &implementationList();
impls = &getImplementations();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
std::cerr << __func__ << ": getImplementations() failed: " << e.what() << "\n";
return nullptr;
}

@@ -268,13 +248,15 @@ LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::

const Implementation *impl = nullptr;

for (const auto &desiredBackend: desiredBackends) {
for (const auto &desiredBackend : desiredBackends) {
auto cacheIt = implCache.find(desiredBackend);
if (cacheIt != implCache.end())
return cacheIt->second.get(); // cached

for (const auto &i: *impls) {
if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
for (auto &li : *impls) {
if (li.buildBackend == desiredBackend) {
auto &i = li.get();
assert(i.m_modelType == "LLaMA");
impl = &i;
break;
}
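Note how the loader above no longer regex-matches library names and dlopens every match up front; it only checks whether the expected file for each backend exists, and the dlopen itself happens later in LazyImplementation::get(). In isolation, the probing step looks roughly like this (a sketch; the search directory, extension, and AVX flag are placeholders, assuming the llamamodel-mainline-<backend>[-avxonly]<ext> naming used in the diff):

```cpp
#include <array>
#include <filesystem>
#include <iostream>
#include <string>
#include <string_view>

namespace fs = std::filesystem;
using namespace std::string_literals;
using namespace std::string_view_literals;

int main() {
    // Candidate backends, in no particular order (mirrors the diff's ALL_BUILD_BACKENDS).
    constexpr std::array backends { "cpu"sv, "metal"sv, "kompute"sv, "vulkan"sv, "cuda"sv };

    const fs::path searchDir = ".";   // placeholder for the implementations search path
    const std::string ext    = ".so"; // placeholder for LIB_FILE_EXT
    const bool avxonly       = false; // placeholder for the cpu_supports_avx2() == 0 check

    for (auto backend : backends) {
        auto file = searchDir /
            "llamamodel-mainline-"s.append(backend).append(avxonly ? "-avxonly" : "").append(ext);
        // Only existence is checked here; the actual dlopen happens lazily on first use.
        if (fs::exists(file))
            std::cout << "found " << backend << " backend: " << file << '\n';
    }
}
```

Probing by exact filename instead of scanning with a regex also means a backend that is not installed costs only an fs::exists call at startup.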
2 changes: 1 addition & 1 deletion gpt4all-chat/src/chatllm.cpp
@@ -528,7 +528,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
bool actualDeviceIsCPU = true;

#if defined(Q_OS_MAC) && defined(__aarch64__)
if (m_llModelInfo.model->implementation().buildVariant() == "metal")
if (m_llModelInfo.model->implementation().buildBackend() == "metal")
actualDeviceIsCPU = false;
#else
if (requestedDevice != "CPU") {
2 changes: 1 addition & 1 deletion gpt4all-chat/src/embllm.cpp
@@ -108,7 +108,7 @@ bool EmbeddingLLMWorker::loadModel()
bool actualDeviceIsCPU = true;

#if defined(Q_OS_MAC) && defined(__aarch64__)
if (m_model->implementation().buildVariant() == "metal")
if (m_model->implementation().buildBackend() == "metal")
actualDeviceIsCPU = false;
#else
if (requestedDevice != "CPU") {