llmodel: dlopen llama.cpp libraries lazily instead of eagerly #3308

Draft: wants to merge 1 commit into base: main
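The change in a nutshell: each llamamodel-mainline-* library is recorded up front as a backend name plus a file path, and the actual dlopen is deferred until that backend is first requested, instead of loading every library eagerly at startup. A minimal sketch of the pattern (simplified; `LibraryHandle` and the file names below are illustrative stand-ins for the Dlhandle wrapper and paths used in the diff):

```cpp
#include <filesystem>
#include <optional>
#include <string>
#include <vector>

namespace fs = std::filesystem;

// Illustrative stand-in for the real Dlhandle wrapper around dlopen()/LoadLibrary().
struct LibraryHandle {
    explicit LibraryHandle(const fs::path &p) { /* dlopen(p.c_str(), ...) would go here */ }
};

// Lazy entry: remember which backend a library file provides, open it on first use.
struct LazyBackend {
    std::string buildBackend;                 // e.g. "cpu", "metal", "cuda"
    fs::path path;                            // library file that provides it
    std::optional<LibraryHandle> handle = {}; // empty until first requested

    const LibraryHandle &get() {
        if (!handle)             // first call pays the dlopen cost; later calls reuse the handle
            handle.emplace(path);
        return *handle;
    }
};

int main() {
    // Illustrative file names; nothing is opened while this list is built.
    std::vector<LazyBackend> backends {
        { "cpu",  "llamamodel-mainline-cpu.so"  },
        { "cuda", "llamamodel-mainline-cuda.so" },
    };
    backends[0].get(); // only the backend that is actually requested gets opened
}
```

The std::optional doubles as the "loaded yet?" flag, which is essentially what the new LazyImplementation struct in this PR does with its impl member.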
36 changes: 25 additions & 11 deletions gpt4all-backend/include/gpt4all-backend/llmodel.h
@@ -6,6 +6,7 @@
#include <cstddef>
#include <cstdint>
#include <expected>
#include <filesystem>
#include <functional>
#include <optional>
#include <span>
@@ -19,6 +20,7 @@
class Dlhandle;

using namespace std::string_literals;
namespace fs = std::filesystem;

#define LLMODEL_MAX_PROMPT_BATCH 128

@@ -94,12 +96,13 @@ class LLModel {

class Implementation {
public:
Implementation(std::string buildBackend, Dlhandle &&dlhandle);
Implementation(const Implementation &) = delete;
Implementation(Implementation &&);
~Implementation();

std::string_view modelType() const { return m_modelType; }
std::string_view buildVariant() const { return m_buildVariant; }
const std::string &buildBackend() const { return m_buildBackend; }
std::string_view modelType () const { return m_modelType; }

static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
@@ -114,19 +117,17 @@ class LLModel {
static int cpuSupportsAVX2();

private:
Implementation(Dlhandle &&);

static const std::vector<Implementation> &implementationList();
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
static const Implementation *findImplementation(const char *fname, const std::string &buildBackend);
static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);

char *(*m_getFileArch)(const char *fname);
bool (*m_isArchSupported)(const char *arch);
LLModel *(*m_construct)();
std::string m_buildBackend;
Dlhandle *m_dlhandle;

char *(*m_getFileArch) (const char *fname);
bool (*m_isArchSupported)(const char *arch);
LLModel *(*m_construct) ();

std::string_view m_modelType;
std::string_view m_buildVariant;
Dlhandle *m_dlhandle;
};

struct PromptContext {
@@ -141,6 +142,16 @@ class LLModel {
float contextErase = 0.5f; // percent of context to erase if we exceed the context window
};

private:
struct LazyImplementation {
std::string buildBackend;
fs::path path;
std::optional<Implementation> impl = {};

const Implementation &get();
};

public:
explicit LLModel() {}
virtual ~LLModel() {}

Expand Down Expand Up @@ -267,6 +278,9 @@ class LLModel {
const PromptContext &promptCtx,
int32_t nPast);

private:
static std::vector<LazyImplementation> &getImplementations();

friend class LLMImplementation;
};

9 changes: 0 additions & 9 deletions gpt4all-backend/src/llamamodel.cpp
@@ -1278,21 +1278,12 @@ void LLamaModel::embedInternal(
#endif

extern "C" {
DLL_EXPORT bool is_g4a_backend_model_implementation()
{
return true;
}

DLL_EXPORT const char *get_model_type()
{
return modelType_;
}

DLL_EXPORT const char *get_build_variant()
{
return GGML_BUILD_VARIANT;
}

DLL_EXPORT char *get_file_arch(const char *fname)
{
char *arch = nullptr;
130 changes: 56 additions & 74 deletions gpt4all-backend/src/llmodel.cpp
@@ -10,9 +10,9 @@
#include <iterator>
#include <memory>
#include <optional>
#include <regex>
#include <sstream>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

@@ -32,6 +32,8 @@
# include "sysinfo.h" // for getSystemTotalRAMInBytes
#endif

using namespace std::string_literals;
using namespace std::string_view_literals;
namespace fs = std::filesystem;

#ifndef __APPLE__
@@ -66,29 +68,30 @@ std::string s_implementations_search_path = ".";
#define cpu_supports_avx2() !!__builtin_cpu_supports("avx2")
#endif

LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
: m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
LLModel::Implementation::Implementation(std::string buildBackend, Dlhandle &&dlhandle)
: m_buildBackend(std::move(buildBackend))
, m_dlhandle(new Dlhandle(std::move(dlhandle)))
{
auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
assert(get_model_type);
m_modelType = get_model_type();
auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
assert(get_build_variant);
m_buildVariant = get_build_variant();
m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
assert(m_getFileArch);
m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
assert(m_isArchSupported);
m_construct = m_dlhandle->get<LLModel *()>("construct");
assert(m_construct);

m_modelType = get_model_type();
}

LLModel::Implementation::Implementation(Implementation &&o)
: m_getFileArch(o.m_getFileArch)
: m_buildBackend(o.m_buildBackend)
, m_dlhandle(o.m_dlhandle)
, m_getFileArch(o.m_getFileArch)
, m_isArchSupported(o.m_isArchSupported)
, m_construct(o.m_construct)
, m_modelType(o.m_modelType)
, m_buildVariant(o.m_buildVariant)
, m_dlhandle(o.m_dlhandle) {
{
o.m_dlhandle = nullptr;
}

@@ -97,11 +100,6 @@ LLModel::Implementation::~Implementation()
delete m_dlhandle;
}

static bool isImplementation(const Dlhandle &dl)
{
return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
}

// Add the CUDA Toolkit to the DLL search path on Windows.
// This is necessary for chat.exe to find CUDA when started from Qt Creator.
static void addCudaSearchPath()
@@ -117,78 +115,60 @@ static void addCudaSearchPath()
#endif
}

const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList()
auto LLModel::LazyImplementation::get() -> const Implementation &
{
if (!impl) impl.emplace(buildBackend, Dlhandle(path));
return *impl;
}

auto LLModel::getImplementations() -> std::vector<LazyImplementation> &
{
// in no particular order
static const std::array ALL_BUILD_BACKENDS { "cpu"sv, "metal"sv, "kompute"sv, "vulkan"sv, "cuda"sv };
static const std::string_view LIB_EXT(LIB_FILE_EXT);

if (cpu_supports_avx() == 0) {
throw std::runtime_error("CPU does not support AVX");
}

// NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
// individual models without the cleanup of the static list interfering
static auto* libs = new std::vector<Implementation>([] () {
std::vector<Implementation> fres;
static auto* libs = new std::vector<LazyImplementation>([] () {
std::vector<LazyImplementation> fres;

addCudaSearchPath();

std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)";
if (cpu_supports_avx2() == 0) {
impl_name_re += "-avxonly";
}
std::regex re(impl_name_re);
auto search_in_directory = [&](const std::string& paths) {
std::stringstream ss(paths);
std::string path;
// Split the paths string by the delimiter and process each path.
while (std::getline(ss, path, ';')) {
std::u8string u8_path(path.begin(), path.end());
// Iterate over all libraries
for (const auto &f : fs::directory_iterator(u8_path)) {
const fs::path &p = f.path();

if (p.extension() != LIB_FILE_EXT) continue;
if (!std::regex_search(p.stem().string(), re)) continue;

// Add to list if model implementation
Dlhandle dl;
try {
dl = Dlhandle(p);
} catch (const Dlhandle::Exception &e) {
std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
continue;
}
if (!isImplementation(dl)) {
std::cerr << "Not an implementation: " << p.filename().string() << "\n";
continue;
}
fres.emplace_back(Implementation(std::move(dl)));
}
bool avxonly = cpu_supports_avx2() == 0;
std::stringstream ss(s_implementations_search_path);
std::string piece;
// Split the paths string by the delimiter and process each path.
while (std::getline(ss, piece, ';')) {
auto basePath = fs::path(std::u8string(piece.begin(), piece.end()));
// Iterate over all libraries
for (auto &buildBackend : ALL_BUILD_BACKENDS) {
auto path = basePath /
"llamamodel-mainline-"s.append(buildBackend).append(avxonly ? "-avxonly" : "").append(LIB_EXT);
if (fs::exists(path))
fres.push_back(LazyImplementation { std::string(buildBackend), path });
}
};

search_in_directory(s_implementations_search_path);
}

return fres;
}());
// Return static result
return *libs;
}

static std::string applyCPUVariant(const std::string &buildVariant)
auto LLModel::Implementation::findImplementation(const char *fname, const std::string &buildBackend)
-> const Implementation *
{
if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
return buildVariant + "-avxonly";
}
return buildVariant;
}

const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant)
{
bool buildVariantMatched = false;
bool buildBackendMatched = false;
std::optional<std::string> archName;
for (const auto& i : implementationList()) {
if (buildVariant != i.m_buildVariant) continue;
buildVariantMatched = true;
for (auto &li : getImplementations()) {
if (li.buildBackend != buildBackend) continue;
buildBackendMatched = true;

auto &i = li.get();
char *arch = i.m_getFileArch(fname);
if (!arch) continue;
archName = arch;
@@ -198,7 +178,7 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
if (archSupported) return &i;
}

if (!buildVariantMatched)
if (!buildBackendMatched)
return nullptr;
if (!archName)
throw UnsupportedModelError("Unsupported file format");
@@ -216,7 +196,7 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, const
}

for (const auto &desiredBackend: desiredBackends) {
const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
const auto *impl = findImplementation(modelPath.c_str(), desiredBackend);

if (impl) {
// Construct llmodel implementation
@@ -251,11 +231,11 @@ LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::
{
static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;

const std::vector<Implementation> *impls;
std::vector<LazyImplementation> *impls;
try {
impls = &implementationList();
impls = &getImplementations();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
std::cerr << __func__ << ": getImplementations() failed: " << e.what() << "\n";
return nullptr;
}

@@ -268,13 +248,15 @@ LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::

const Implementation *impl = nullptr;

for (const auto &desiredBackend: desiredBackends) {
for (const auto &desiredBackend : desiredBackends) {
auto cacheIt = implCache.find(desiredBackend);
if (cacheIt != implCache.end())
return cacheIt->second.get(); // cached

for (const auto &i: *impls) {
if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
for (auto &li : *impls) {
if (li.buildBackend == desiredBackend) {
auto &i = li.get();
assert(i.m_modelType == "LLaMA");
impl = &i;
break;
}
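Note how the loader above no longer regex-matches library names and dlopens every match up front; it only checks whether the expected file for each backend exists, and the dlopen itself happens later in LazyImplementation::get(). In isolation, the probing step looks roughly like this (a sketch; the search directory, extension, and AVX flag are placeholders, assuming the llamamodel-mainline-<backend>[-avxonly]<ext> naming used in the diff):

```cpp
#include <array>
#include <filesystem>
#include <iostream>
#include <string>
#include <string_view>

namespace fs = std::filesystem;
using namespace std::string_literals;
using namespace std::string_view_literals;

int main() {
    // Candidate backends, in no particular order (mirrors the diff's ALL_BUILD_BACKENDS).
    constexpr std::array backends { "cpu"sv, "metal"sv, "kompute"sv, "vulkan"sv, "cuda"sv };

    const fs::path searchDir = ".";   // placeholder for the implementations search path
    const std::string ext    = ".so"; // placeholder for LIB_FILE_EXT
    const bool avxonly       = false; // placeholder for the cpu_supports_avx2() == 0 check

    for (auto backend : backends) {
        auto file = searchDir /
            "llamamodel-mainline-"s.append(backend).append(avxonly ? "-avxonly" : "").append(ext);
        // Only existence is checked here; the actual dlopen happens lazily on first use.
        if (fs::exists(file))
            std::cout << "found " << backend << " backend: " << file << '\n';
    }
}
```

Probing by exact filename instead of scanning with a regex also means a backend that is not installed costs only an fs::exists call at startup.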
2 changes: 1 addition & 1 deletion gpt4all-chat/src/chatllm.cpp
@@ -528,7 +528,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
bool actualDeviceIsCPU = true;

#if defined(Q_OS_MAC) && defined(__aarch64__)
if (m_llModelInfo.model->implementation().buildVariant() == "metal")
if (m_llModelInfo.model->implementation().buildBackend() == "metal")
actualDeviceIsCPU = false;
#else
if (requestedDevice != "CPU") {
2 changes: 1 addition & 1 deletion gpt4all-chat/src/embllm.cpp
@@ -108,7 +108,7 @@ bool EmbeddingLLMWorker::loadModel()
bool actualDeviceIsCPU = true;

#if defined(Q_OS_MAC) && defined(__aarch64__)
if (m_model->implementation().buildVariant() == "metal")
if (m_model->implementation().buildBackend() == "metal")
actualDeviceIsCPU = false;
#else
if (requestedDevice != "CPU") {