
Use qdrant instead of faiss for vectorstore
vemonet committed Jan 10, 2024
1 parent b5ddc0e commit 7cfeda3
Showing 8 changed files with 82 additions and 39 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -12,7 +12,7 @@ Easily configure and deploy a **fully self-hosted chatbot web service** based on

</div>

- 🌐 Free and Open Source chatbot web service with UI and API
- 🌐 Free and Open Source chatbot web service with UI and API.
- 🏡 Fully self-hosted, not tied to any service, and offline capable. Forget about API keys! Models and embeddings can be pre-downloaded, and the training and inference processes can run off-line if necessary.
- 🔌 Web API described using OpenAPI specs: GET/POST operations, websocket for streaming response
- 🪶 Chat web UI working well on desktop and mobile, with streaming response, and markdown rendering. Alternative gradio-based UI also available.
@@ -23,7 +23,7 @@ Easily configure and deploy a **fully self-hosted chatbot web service** based on
- 🤖 Various types of agents can be deployed:
- **💬 Generic conversation**: do not need any additional training, just configure settings such as the template prompt
- **📚 Documents-based question answering** (experimental): automatically build similarity vectors from documents uploaded through the API UI, the chatbot will use them to answer your question, and return which documents were used to generate the answer (PDF, CSV, HTML, JSON, markdown, and more supported).
- 🔍 Readable logs to understand what is going on
- 🔍 Readable logs to understand what is going on.

## 📖 Documentation

46 changes: 32 additions & 14 deletions chat.yml
@@ -1,35 +1,53 @@
# Config for a generic conversational agent
# Config for a Question Answering (qa) agent
# Will answer based on the provided documents, and return which docs were used to answer the question
llm:
model_path: ./models/mixtral-8x7b-instruct-v0.1.Q2_K.gguf
model_download: https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q2_K.gguf
temperature: 0.01 # Config how creative, but also potentially wrong, the model can be. 0 is safe, 1 is adventurous
temperature: 0.01 # Config how creative (but also potentially wrong) the model can be. 0 is safe, 1 is adventurous
max_new_tokens: 1024 # Max number of words the LLM can generate

prompt:
# Always use input for the human input variable with a generic agent
variables: [input, history]
variables: ["question", "context"]
template: |
Your are an assistant, answer the question briefly.
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
{history}
User: {input}
AI Assistant:
Context: {context}
Question: {question}
Only return the helpful answer below and nothing else.
Helpful answer:
vector:
vector_path: ./vectorstore/db_faiss # Path to the vectorstore to do QA retrieval
vector_download: null
embeddings_path: ./embeddings/all-MiniLM-L6-v2 # Embeddings used to generate the vectors. To use from HF: sentence-transformers/all-MiniLM-L6-v2
embeddings_download: https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/all-MiniLM-L6-v2.zip
documents_path: ./documents # Path to documents to vectorize
# When vectorizing we split the text up into small, semantically meaningful chunks (often sentences):
chunk_size: 500 # Maximum size of chunks, in terms of number of characters
chunk_overlap: 50 # Overlap in characters between chunks
chain_type: stuff # Or: map_reduce, reduce, map_rerank. More details: https://docs.langchain.com/docs/components/chains/index_related_chains
search_type: similarity # Or: similarity_score_threshold, mmr. More details: https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
return_sources_count: 2 # Number of sources to return when generating an answer
score_threshold: null # If using the similarity_score_threshold search_type. Between 0 and 1

info:
title: "Libre Chat"
version: "0.1.0"
description: |
Open source and free chatbot powered by [LangChain](https://python.langchain.com) and [llama.cpp](https://github.com/ggerganov/llama.cpp)
Open source and free **question-answering** chatbot powered by [LangChain](https://python.langchain.com) and [llama.cpp](https://github.com/ggerganov/llama.cpp)
examples:
- What is the capital of the Netherlands?
- Which drugs are approved by the FDA to mitigate Alzheimer symptoms?
- How can I create a logger with timestamp using python logging?
- What was the GDP of France in 1998?
favicon: https://raw.github.com/vemonet/libre-chat/main/docs/docs/assets/logo.png
repository_url: https://github.com/vemonet/libre-chat
public_url: https://chat.semanticscience.org
contact:
name: Vincent Emonet
email: [email protected]
name: "Vincent Emonet"
email: "[email protected]"
license_info:
name: MIT license
url: https://raw.github.com/vemonet/libre-chat/main/LICENSE.txt
name: "MIT license"
url: "https://raw.github.com/vemonet/libre-chat/main/LICENSE.txt"
workers: 4
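
For reference, here is a minimal sketch (not the libre-chat implementation) of how the `chunk_size`, `chunk_overlap`, and prompt `variables` settings above typically map onto LangChain objects. The use of `RecursiveCharacterTextSplitter` is an assumption for illustration; the splitter class actually used is not shown in this diff.

```python
# Hedged sketch: wiring the chat.yml chunking and prompt settings with LangChain.
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter  # assumed splitter class

# Split documents into ~500-character chunks with 50 characters of overlap,
# matching the chunk_size / chunk_overlap values in the config.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Prompt template using the same input variables as chat.yml: {context} and {question}.
qa_prompt = PromptTemplate(
    template=(
        "Use the following pieces of information to answer the user's question.\n"
        "If you don't know the answer, just say that you don't know, "
        "don't try to make up an answer.\n\n"
        "Context: {context}\n"
        "Question: {question}\n"
        "Only return the helpful answer below and nothing else.\n"
        "Helpful answer:"
    ),
    input_variables=["context", "question"],
)
```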
1 change: 0 additions & 1 deletion config/chat-conversation.yml
@@ -33,4 +33,3 @@ info:
license_info:
name: MIT license
url: https://raw.github.com/vemonet/libre-chat/main/LICENSE.txt
workers: 4
16 changes: 16 additions & 0 deletions deploy.sh
@@ -0,0 +1,16 @@
#!/bin/bash

if [ "$1" = "--no-cache" ]; then
echo "📦️ Building without cache"
ssh idsg1 'cd /mnt/um-share-drive/vemonet/libre-chat ; git pull ; docker-compose build --no-cache ; docker-compose down ; docker-compose up --force-recreate -d'
else
echo "♻️ Building with cache"
ssh ids2 'cd /mnt/um-share-drive/vemonet/libre-chat ; git pull ; docker-compose up --force-recreate --build -d'
fi


# Build with cache:
# ssh ids2 'cd /data/deploy-services/knowledge-collaboratory ; git pull ; docker-compose -f docker-compose.yml -f docker-compose.prod.yml up --force-recreate --build -d'

# Build without cache:
# ssh ids2 'cd /data/deploy-services/knowledge-collaboratory ; git pull ; docker-compose -f docker-compose.yml -f docker-compose.prod.yml build ; docker-compose -f docker-compose.yml -f docker-compose.prod.yml down ; docker-compose -f docker-compose.yml -f docker-compose.prod.yml up --force-recreate -d'
10 changes: 5 additions & 5 deletions docs/docs/index.md
@@ -10,10 +10,11 @@ Easily configure and deploy a **fully self-hosted chatbot web service** based on

</div>

- 🌐 Free and Open Source chatbot web service with UI and API
- 🌐 Free and Open Source chatbot web service with UI and API.
- 🏡 Fully self-hosted, not tied to any service, and offline capable. Forget about API keys! Models and embeddings can be pre-downloaded, and the training and inference processes can run off-line if necessary.
- 🔌 Web API described using OpenAPI specs: GET/POST operations, websocket for streaming response
- 🪶 Chat web UI working well on desktop and mobile, with streaming response, and markdown rendering. Alternative gradio-based UI also available.
- 🚀 Easy to setup, no need to program, just configure the service with a [YAML](https://yaml.org/) file, and start it with 1 command
- 🪶 Chat web interface (Gradio-based, or custom HTML), working well on desktop and mobile, with streaming response, and markdown rendering.
- 📦 Available as a `pip` package 🐍, or `docker` image 🐳
- 🐌 No need for GPU, this will work even on your laptop CPU! That said, just running on CPUs can be quite slow (up to 1 min to answer a documents-based question on recent laptops).
- 🦜 Powered by [`LangChain`](https://python.langchain.com) and [`llama.cpp`](https://github.com/ggerganov/llama.cpp) to perform inference locally.
@@ -37,7 +38,7 @@ No need to program! The whole deployment can be configured from a YAML file: pat

2. Configure the service in a `chat.yml` file

3. Start the chat web service from the terminal with `libre-chat start` or `docker compose up`
3. Start the chat web service from the terminal with `libre-chat start` or `docker compose up`. The first run will take some time to download the model if it has not been pre-downloaded (model sizes are around 15+ GB)

Seasoned developers can also manipulate LLM models, and deploy the API in python scripts using the `libre_chat` module.

@@ -63,8 +64,7 @@ The web service is deployed using a [**⚡ FastAPI**](https://fastapi.tiangolo.c

- 📮 `GET` and `POST` on `/prompt` to query the model
- 🔌 Websocket on `/chat` to open a connection with the API, and query the model
- 🖥️ Chatbot web UI served on the root URL `/`
- The web UI is contained within a single HTML file templated using [Jinja2](https://jinja.palletsprojects.com), written in vanilla JS, using [Tailwind](https://tailwindcss.com) CSS for styling, and [marked](https://marked.js.org/) for markdown rendering
- 🖥️ Chatbot web UI served on the root URL `/`, built with Astro, SolidJS, [Tailwind](https://tailwindcss.com) and daisyUI for styling, and [marked](https://marked.js.org/) for markdown rendering.

All files required for querying the model are stored and accessed locally using [**🦜🔗 LangChain**](https://python.langchain.com): the main model binary, the embeddings and documents to create the vectors, and the [vectorstore](https://python.langchain.com/docs/modules/data_connection/vectorstores/).
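
To illustrate the `/prompt` endpoint listed above, a hedged client example follows. The base URL, port, and JSON field name are assumptions; the authoritative request schema is in the OpenAPI docs that FastAPI serves at `/docs`.

```python
# Illustrative only: host/port and the "prompt" field name are assumptions,
# check the OpenAPI docs served at /docs for the exact request schema.
import requests

resp = requests.post(
    "http://localhost:8000/prompt",  # assumed local deployment URL
    json={"prompt": "What is the capital of the Netherlands?"},
    timeout=120,  # CPU-only inference can take a while
)
resp.raise_for_status()
print(resp.json())
```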

5 changes: 3 additions & 2 deletions pyproject.toml
@@ -33,9 +33,10 @@ dependencies = [
"langchain",
"langchain-community",
"llama-cpp-python", # To perform LLM inference
"qdrant-client", # Vectorstore
# "fastembed",
"faiss-cpu >=1.7.4", # To generate the vectorstore
"sentence_transformers", # To produce the embeddings
# TODO: replace with fastembed?
"pypdf", # For PDFloader, or use PDFMiner?
"unstructured", # For the email loader
"typer >=0.6.0",
@@ -105,7 +106,7 @@ post-install-commands = [
]

[tool.hatch.envs.default.scripts]
dev = "uvicorn scripts.main:app --reload {args}"
dev = "uvicorn scripts.main:app {args}"
vector = [
"libre-chat start config/chat-vectorstore-qa.yml",
]
38 changes: 23 additions & 15 deletions src/libre_chat/llm.py
@@ -27,7 +27,7 @@
)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import LlamaCpp
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Qdrant

from libre_chat.conf import ChatConf, default_conf
from libre_chat.utils import BOLD, CYAN, END, log, parallel_download
@@ -163,11 +163,18 @@ def get_llm(self, config: Optional[Dict[str, Any]] = None) -> LlamaCpp:
def setup_dbqa(self) -> None:
"""Setup the vectorstore for QA"""
if self.has_vectorstore():
from qdrant_client import QdrantClient

embeddings = HuggingFaceEmbeddings(
model_name=self.conf.vector.embeddings_path, model_kwargs={"device": self.device}
)
# FAISS should automatically use GPU?
vectorstore = FAISS.load_local(self.get_vectorstore(), embeddings)
# vectorstore = FAISS.load_local(self.get_vectorstore(), embeddings)
vectorstore = Qdrant(
QdrantClient(path=self.conf.vector.vector_path),
collection_name="libre_chat_rag",
embeddings=embeddings,
)

search_args: Dict[str, Any] = {"k": self.conf.vector.return_sources_count}
if self.conf.vector.score_threshold is not None:
@@ -176,14 +183,15 @@ def setup_dbqa(self) -> None:
llm=self.llm,
chain_type=self.conf.vector.chain_type,
retriever=vectorstore.as_retriever(
search_type=self.conf.vector.search_type, search_kwargs=search_args
# search_type=self.conf.vector.search_type, search_kwargs=search_args
),
return_source_documents=self.conf.vector.return_sources_count > 0,
chain_type_kwargs={"prompt": self.prompt},
)

def build_vectorstore(self, documents_path: Optional[str] = None) -> Optional[FAISS]:
"""Build vectorstore from PDF documents with FAISS."""
def build_vectorstore(self, documents_path: Optional[str] = None) -> Optional[Qdrant]:
"""Build vectorstore from documents."""
# https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/vectorstores/qdrant.py
time_start = datetime.now()
documents_path = documents_path if documents_path else self.conf.vector.documents_path
docs_count = len(os.listdir(documents_path))
@@ -214,19 +222,19 @@ def build_vectorstore(self, documents_path: Optional[str] = None) -> Optional[FA
chunk_size=self.conf.vector.chunk_size, chunk_overlap=self.conf.vector.chunk_overlap
)
splitted_texts = text_splitter.split_documents(documents)
# TODO: use fastembed?
embeddings = HuggingFaceEmbeddings(
model_name=self.conf.vector.embeddings_path, model_kwargs={"device": self.device}
)
vectorstore = FAISS.from_documents(splitted_texts, embeddings)
# TODO: use Qdrant
# vectorstore = Qdrant.from_documents(
# splitted_texts,
# embeddings,
# path=self.vector_path,
# collection_name="libre_chat_rag",
# )
if self.vector_path:
vectorstore.save_local(self.vector_path)
vectorstore = Qdrant.from_documents(
splitted_texts,
embeddings,
path=self.conf.vector.vector_path,
collection_name="libre_chat_rag",
)
# vectorstore = FAISS.from_documents(splitted_texts, embeddings)
# if self.vector_path:
# vectorstore.save_local(self.vector_path)
log.info(f"✅ Vectorstore built in {datetime.now() - time_start}")
return vectorstore
return None
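
A quick way to sanity-check the on-disk Qdrant collection created by the `Qdrant.from_documents` call above: open it with a local client and run a similarity search. The vectorstore path, embeddings path, and collection name come from the config shown in this commit; the rest is a hedged sketch rather than libre-chat code.

```python
# Hedged sketch: open the local (file-based) Qdrant store built by build_vectorstore
# and query it directly. Paths and collection name are taken from chat.yml.
from qdrant_client import QdrantClient
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant

client = QdrantClient(path="./vectorstore/db_faiss")  # local on-disk mode, no server needed
print(client.count(collection_name="libre_chat_rag", exact=True))  # number of stored vectors

embeddings = HuggingFaceEmbeddings(model_name="./embeddings/all-MiniLM-L6-v2")
store = Qdrant(client=client, collection_name="libre_chat_rag", embeddings=embeddings)
for doc in store.similarity_search("Which drugs are approved to mitigate Alzheimer symptoms?", k=2):
    print(doc.metadata.get("source"), doc.page_content[:80])
```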
1 change: 1 addition & 0 deletions src/libre_chat/router.py
@@ -132,6 +132,7 @@ def upload_documents(
with zipfile.ZipFile(file_path, "r") as zip_ref:
zip_ref.extractall(self.conf.vector.documents_path)
os.remove(file_path)
# TODO: add just the uploaded files instead of rebuilding the whole vectorstore
self.llm.build_vectorstore()
self.llm.setup_dbqa()
return JSONResponse(
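
For completeness, a hedged example of exercising this upload route from a client. The endpoint path `/documents` and the multipart field name `files` are assumptions based on the handler name `upload_documents` and are not confirmed by this diff; verify them against the generated OpenAPI docs.

```python
# Illustrative only: URL, route, and field name are assumptions.
# After the upload, the server rebuilds the vectorstore from the documents folder.
import requests

with open("my_docs.zip", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/documents",  # assumed route for upload_documents
        files={"files": ("my_docs.zip", f, "application/zip")},
    )
print(resp.status_code, resp.json())
```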
