diff --git a/README.md b/README.md
index 089f55f..5a161aa 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ Easily configure and deploy a **fully self-hosted chatbot web service** based on
-- 🌐 Free and Open Source chatbot web service with UI and API
+- 🌐 Free and Open Source chatbot web service with UI and API.
 - 🏡 Fully self-hosted, not tied to any service, and offline capable. Forget about API keys! Models and embeddings can be pre-downloaded, and the training and inference processes can run off-line if necessary.
 - 🔌 Web API described using OpenAPI specs: GET/POST operations, websocket for streaming response
 - 🪶 Chat web UI working well on desktop and mobile, with streaming response, and markdown rendering. Alternative gradio-based UI also available.
@@ -23,7 +23,7 @@ Easily configure and deploy a **fully self-hosted chatbot web service** based on
 - 🤖 Various types of agents can be deployed:
   - **💎 Generic conversation**: do not need any additional training, just configure settings such as the template prompt
   - **📚 Documents-based question answering** (experimental): automatically build similarity vectors from documents uploaded through the API UI, the chatbot will use them to answer your question, and return which documents were used to generate the answer (PDF, CSV, HTML, JSON, markdown, and more supported).
-- 🔍 Readable logs to understand what is going on
+- 🔍 Readable logs to understand what is going on.
 
 ## 📖 Documentation
diff --git a/chat.yml b/chat.yml
index 26bf254..798b5ef 100644
--- a/chat.yml
+++ b/chat.yml
@@ -1,35 +1,53 @@
-# Config for a generic conversational agent
+# Config for a Question Answering (QA) agent
+# Will answer based on provided documents, and return which docs were used to answer the question
 llm:
   model_path: ./models/mixtral-8x7b-instruct-v0.1.Q2_K.gguf
   model_download: https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q2_K.gguf
-  temperature: 0.01 # Config how creative, but also potentially wrong, the model can be. 0 is safe, 1 is adventurous
+  temperature: 0.01 # Configure how creative (but also potentially wrong) the model can be. 0 is safe, 1 is adventurous
   max_new_tokens: 1024 # Max number of words the LLM can generate
 prompt:
-  # Always use input for the human input variable with a generic agent
-  variables: [input, history]
+  variables: ["question", "context"]
   template: |
-    Your are an assistant, answer the question briefly.
+    Use the following pieces of information to answer the user's question.
+    If you don't know the answer, just say that you don't know, don't try to make up an answer.
 
-    {history}
-    User: {input}
-    AI Assistant:
+    Context: {context}
+    Question: {question}
+
+    Only return the helpful answer below and nothing else.
+    Helpful answer:
+
+vector:
+  vector_path: ./vectorstore/db_faiss # Path to the vectorstore to do QA retrieval
+  vector_download: null
+  embeddings_path: ./embeddings/all-MiniLM-L6-v2 # Embeddings used to generate the vectors. To use from HF: sentence-transformers/all-MiniLM-L6-v2
+  embeddings_download: https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/all-MiniLM-L6-v2.zip
+  documents_path: ./documents # Path to documents to vectorize
+  # When vectorizing we split the text up into small, semantically meaningful chunks (often sentences):
+  chunk_size: 500 # Maximum size of chunks, in number of characters
+  chunk_overlap: 50 # Overlap in characters between chunks
+  chain_type: stuff # Or: map_reduce, reduce, map_rerank. More details: https://docs.langchain.com/docs/components/chains/index_related_chains
+  search_type: similarity # Or: similarity_score_threshold, mmr. More details: https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
+  return_sources_count: 2 # Number of sources to return when generating an answer
+  score_threshold: null # Only used with the similarity_score_threshold search_type. Between 0 and 1
 
 info:
   title: "Libre Chat"
   version: "0.1.0"
   description: |
-    Open source and free chatbot powered by [LangChain](https://python.langchain.com) and [llama.cpp](https://github.com/ggerganov/llama.cpp)
+    Open source and free **question-answering** chatbot powered by [LangChain](https://python.langchain.com) and [llama.cpp](https://github.com/ggerganov/llama.cpp)
   examples:
   - What is the capital of the Netherlands?
   - Which drugs are approved by the FDA to mitigate Alzheimer symptoms?
-  - How can I create a logger with timestamp using python logging?
+  - What was the GDP of France in 1998?
   favicon: https://raw.github.com/vemonet/libre-chat/main/docs/docs/assets/logo.png
   repository_url: https://github.com/vemonet/libre-chat
   public_url: https://chat.semanticscience.org
   contact:
-    name: Vincent Emonet
-    email: vincent.emonet@gmail.com
+    name: "Vincent Emonet"
+    email: "vincent.emonet@gmail.com"
   license_info:
-    name: MIT license
-    url: https://raw.github.com/vemonet/libre-chat/main/LICENSE.txt
+    name: "MIT license"
+    url: "https://raw.github.com/vemonet/libre-chat/main/LICENSE.txt"
+workers: 4
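Note for reviewers: the new `variables` and `template` fields follow standard LangChain prompt conventions. Below is a minimal sketch (assuming stock LangChain behaviour, not code from this repository) of how a `{context}`/`{question}` template becomes a `PromptTemplate`, which `setup_dbqa()` later hands to the chain through `chain_type_kwargs`:

```python
# Minimal sketch, assuming standard LangChain usage; not the project's own code.
from langchain.prompts import PromptTemplate

qa_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:"""

prompt = PromptTemplate(template=qa_template, input_variables=["context", "question"])
print(prompt.format(context="Paris is the capital of France.", question="What is the capital of France?"))
```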
diff --git a/config/chat-conversation.yml b/config/chat-conversation.yml
index 7a2801e..26bf254 100644
--- a/config/chat-conversation.yml
+++ b/config/chat-conversation.yml
@@ -33,4 +33,3 @@ info:
   license_info:
     name: MIT license
     url: https://raw.github.com/vemonet/libre-chat/main/LICENSE.txt
-workers: 4
diff --git a/deploy.sh b/deploy.sh
new file mode 100755
index 0000000..656bff2
--- /dev/null
+++ b/deploy.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+if [ "$1" = "--no-cache" ]; then
+    echo "📦️ Building without cache"
+    ssh idsg1 'cd /mnt/um-share-drive/vemonet/libre-chat ; git pull ; docker-compose build --no-cache ; docker-compose down ; docker-compose up --force-recreate -d'
+else
+    echo "♻️ Building with cache"
+    ssh ids2 'cd /mnt/um-share-drive/vemonet/libre-chat ; git pull ; docker-compose up --force-recreate --build -d'
+fi
+
+
+# Build with cache:
+# ssh ids2 'cd /data/deploy-services/knowledge-collaboratory ; git pull ; docker-compose -f docker-compose.yml -f docker-compose.prod.yml up --force-recreate --build -d'
+
+# Build without cache:
+# ssh ids2 'cd /data/deploy-services/knowledge-collaboratory ; git pull ; docker-compose -f docker-compose.yml -f docker-compose.prod.yml build ; docker-compose -f docker-compose.yml -f docker-compose.prod.yml down ; docker-compose -f docker-compose.yml -f docker-compose.prod.yml up --force-recreate -d'
diff --git a/docs/docs/index.md b/docs/docs/index.md
index 48ea7b3..2c3c29e 100644
--- a/docs/docs/index.md
+++ b/docs/docs/index.md
@@ -10,10 +10,11 @@ Easily configure and deploy a **fully self-hosted chatbot web service** based on
-- 🌐 Free and Open Source chatbot web service with UI and API
+- 🌐 Free and Open Source chatbot web service with UI and API.
 - 🏡 Fully self-hosted, not tied to any service, and offline capable. Forget about API keys! Models and embeddings can be pre-downloaded, and the training and inference processes can run off-line if necessary.
+- 🔌 Web API described using OpenAPI specs: GET/POST operations, websocket for streaming response
+- 🪶 Chat web UI working well on desktop and mobile, with streaming response, and markdown rendering. Alternative gradio-based UI also available.
 - 🚀 Easy to setup, no need to program, just configure the service with a [YAML](https://yaml.org/) file, and start it with 1 command
-- 🪶 Chat web interface (Gradio-based, or custom HTML), working well on desktop and mobile, with streaming response, and markdown rendering.
 - 📦 Available as a `pip` package 🐍, or `docker` image 🐳
 - 🐌 No need for GPU, this will work even on your laptop CPU! That said, just running on CPUs can be quite slow (up to 1min to answer a documents-base question on recent laptops).
 - 🦜 Powered by [`LangChain`](https://python.langchain.com) and [`llama.cpp`](https://github.com/ggerganov/llama.cpp) to perform inference locally.
@@ -37,7 +38,7 @@ No need to program! The whole deployment can be configured from a YAML file: pat
 
 2. Configure the service in a `chat.yml` file
 
-3. Start the chat web service from the terminal with `libre-chat start` or `docker compose up`
+3. Start the chat web service from the terminal with `libre-chat start` or `docker compose up`. The first time, it will take a while to download the model if it is not already present (model sizes are around 15+ GB).
 
 Seasoned developers can also manipulate LLM models, and deploy the API in python scripts using the `libre_chat` module.
@@ -63,8 +64,7 @@ The web service is deployed using a [**⚡ FastAPI**](https://fastapi.tiangolo.c
 
 - 📮 `GET` and `POST` on `/prompt` to query the model
 - 🔌 Websocket on `/chat` to open a connection with the API, and query the model
-- 🖥️ Chatbot web UI served on the root URL `/`
-  - The web UI is contained within a single HTML file templated using [Jinja2](https://jinja.palletsprojects.com), written in vanilla JS, using [Tailwind](https://tailwindcss.com) CSS for styling, and [marked](https://marked.js.org/) for markdown rendering
+- 🖥️ Chatbot web UI served on the root URL `/`, built with Astro, SolidJS, [Tailwind](https://tailwindcss.com) and daisyUI for styling, and [marked](https://marked.js.org/) for markdown rendering.
 
 All files required for querying the model are stored and accessed locally using [**🦜🔗 LangChain**](https://python.langchain.com): the main model binary, the embeddings and documents to create the vectors, and the [vectorstore](https://python.langchain.com/docs/modules/data_connection/vectorstores/).
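The docs now advertise `GET`/`POST` on `/prompt` and a websocket on `/chat`. For reviewers who want to poke at a running instance, here is a hypothetical client sketch; the `prompt` parameter name and base URL are assumptions, and the OpenAPI docs generated by FastAPI (typically served under `/docs`) remain the authoritative reference:

```python
# Hypothetical sketch of querying a running Libre Chat instance.
# The base URL and the "prompt" parameter name are assumptions; check the
# OpenAPI docs of your deployment for the actual request schema.
import requests

base_url = "http://localhost:8000"
resp = requests.get(
    f"{base_url}/prompt",
    params={"prompt": "Which drugs are approved by the FDA to mitigate Alzheimer symptoms?"},
    timeout=300,
)
resp.raise_for_status()
print(resp.json())
```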
diff --git a/pyproject.toml b/pyproject.toml
index 41146ce..4567762 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,9 +33,10 @@ dependencies = [
     "langchain",
     "langchain-community",
     "llama-cpp-python", # To perform LLM inference
+    "qdrant-client", # Vectorstore
+    # "fastembed",
     "faiss-cpu >=1.7.4", # To generate the vectorstore
     "sentence_transformers", # To produce the embeddings
-    # TODO: replace with fastembed?
     "pypdf", # For PDFloader, or use PDFMiner?
     "unstructured", # For the email loader
     "typer >=0.6.0",
@@ -105,7 +106,7 @@ post-install-commands = [
 ]
 
 [tool.hatch.envs.default.scripts]
-dev = "uvicorn scripts.main:app --reload {args}"
+dev = "uvicorn scripts.main:app {args}"
 vector = [
     "libre-chat start config/chat-vectorstore-qa.yml",
 ]
"unstructured", # For the email loader "typer >=0.6.0", @@ -105,7 +106,7 @@ post-install-commands = [ ] [tool.hatch.envs.default.scripts] -dev = "uvicorn scripts.main:app --reload {args}" +dev = "uvicorn scripts.main:app {args}" vector = [ "libre-chat start config/chat-vectorstore-qa.yml", ] diff --git a/src/libre_chat/llm.py b/src/libre_chat/llm.py index 1f531b8..67bbebd 100644 --- a/src/libre_chat/llm.py +++ b/src/libre_chat/llm.py @@ -27,7 +27,7 @@ ) from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.llms import LlamaCpp -from langchain_community.vectorstores import FAISS +from langchain_community.vectorstores import Qdrant from libre_chat.conf import ChatConf, default_conf from libre_chat.utils import BOLD, CYAN, END, log, parallel_download @@ -163,11 +163,18 @@ def get_llm(self, config: Optional[Dict[str, Any]] = None) -> LlamaCpp: def setup_dbqa(self) -> None: """Setup the vectorstore for QA""" if self.has_vectorstore(): + from qdrant_client import QdrantClient + embeddings = HuggingFaceEmbeddings( model_name=self.conf.vector.embeddings_path, model_kwargs={"device": self.device} ) # FAISS should automatically use GPU? - vectorstore = FAISS.load_local(self.get_vectorstore(), embeddings) + # vectorstore = FAISS.load_local(self.get_vectorstore(), embeddings) + vectorstore = Qdrant( + QdrantClient(path=self.conf.vector.vector_path), + collection_name="libre_chat_rag", + embeddings=embeddings, + ) search_args: Dict[str, Any] = {"k": self.conf.vector.return_sources_count} if self.conf.vector.score_threshold is not None: @@ -176,14 +183,15 @@ def setup_dbqa(self) -> None: llm=self.llm, chain_type=self.conf.vector.chain_type, retriever=vectorstore.as_retriever( - search_type=self.conf.vector.search_type, search_kwargs=search_args + # search_type=self.conf.vector.search_type, search_kwargs=search_args ), return_source_documents=self.conf.vector.return_sources_count > 0, chain_type_kwargs={"prompt": self.prompt}, ) - def build_vectorstore(self, documents_path: Optional[str] = None) -> Optional[FAISS]: - """Build vectorstore from PDF documents with FAISS.""" + def build_vectorstore(self, documents_path: Optional[str] = None) -> Optional[Qdrant]: + """Build vectorstore from documents.""" + # https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/vectorstores/qdrant.py time_start = datetime.now() documents_path = documents_path if documents_path else self.conf.vector.documents_path docs_count = len(os.listdir(documents_path)) @@ -214,19 +222,19 @@ def build_vectorstore(self, documents_path: Optional[str] = None) -> Optional[FA chunk_size=self.conf.vector.chunk_size, chunk_overlap=self.conf.vector.chunk_overlap ) splitted_texts = text_splitter.split_documents(documents) + # TODO: use fastembed? 
diff --git a/src/libre_chat/router.py b/src/libre_chat/router.py
index 3a7fb6a..12c87dc 100644
--- a/src/libre_chat/router.py
+++ b/src/libre_chat/router.py
@@ -132,6 +132,7 @@ def upload_documents(
             with zipfile.ZipFile(file_path, "r") as zip_ref:
                 zip_ref.extractall(self.conf.vector.documents_path)
             os.remove(file_path)
+        # TODO: add just the uploaded files instead of rebuilding the whole vectorstore
         self.llm.build_vectorstore()
         self.llm.setup_dbqa()
         return JSONResponse(
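Regarding the TODO above: incremental indexing could reuse the standard `add_documents()` method that LangChain vectorstores (including the Qdrant wrapper) expose, instead of rebuilding the whole store on every upload. A rough sketch, not part of this PR, with hypothetical inputs:

```python
# Rough sketch of incremental indexing for upload_documents(); not part of this PR.
# add_documents() is standard LangChain vectorstore API, the rest is illustrative.
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.vectorstores import VectorStore


def add_uploaded_documents(vectorstore: VectorStore, file_texts: List[str]) -> None:
    """Append only the newly uploaded documents to an existing vectorstore."""
    docs = [Document(page_content=text) for text in file_texts]
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)  # values from chat.yml
    chunks = splitter.split_documents(docs)
    vectorstore.add_documents(chunks)  # append to the existing collection instead of rebuilding
```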