Initial vector db implementation #335

Draft · wants to merge 18 commits into base: main
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -31,7 +31,7 @@ jobs:
- '3.11'
services:
postgres:
image: postgres:latest
image: pgvector/pgvector:pg16
env:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres_password
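The image swap matters because the stock postgres image does not ship the pgvector extension, which the new vectordb migration's VectorExtension() operation requires. A throwaway sanity check, assuming psycopg2 is installed and using the credentials from the workflow above (the database name is a placeholder):

```python
import psycopg2  # assumed to be available in the test environment

conn = psycopg2.connect(
    host="localhost",
    user="postgres",
    password="postgres_password",
    dbname="postgres",  # placeholder; use the database configured for the test suite
)
with conn, conn.cursor() as cur:
    # Works on pgvector/pgvector:pg16 but fails on postgres:latest
    cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
    cur.execute("SELECT extversion FROM pg_extension WHERE extname = 'vector';")
    print(cur.fetchone())  # e.g. ('0.6.2',)
```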
40 changes: 39 additions & 1 deletion apps/experiments/tasks.py
@@ -1,17 +1,24 @@
import time
from datetime import datetime

import pymupdf4llm
from celery.app import shared_task
from langchain.schema import AIMessage, HumanMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownTextSplitter
from pymupdf import Document as PyMuPDFDocument
from taskbadger.celery import Task as TaskbadgerTask

from apps.channels.datamodels import WebMessage
from apps.chat.bots import create_conversation
from apps.chat.channels import WebChannel
from apps.experiments.models import ExperimentSession, PromptBuilderHistory, SourceMaterial
from apps.experiments.models import Experiment, ExperimentSession, PromptBuilderHistory, SourceMaterial
from apps.files.models import File
from apps.service_providers.models import LlmProvider
from apps.users.models import CustomUser
from apps.utils.taskbadger import update_taskbadger_data
from apps.vectordb.vectorstore import PGVector


@shared_task(bind=True, base=TaskbadgerTask)
@@ -23,6 +30,37 @@ def get_response_for_webchat_task(self, experiment_session_id: int, message_text
return message_handler.new_user_message(message)


@shared_task(bind=True, base=TaskbadgerTask)
def store_rag_embedding(self, experiment_id: int, file_id: int) -> None:
experiment = Experiment.objects.get(id=experiment_id)
file = experiment.files.get(id=file_id)
documents = load_rag_file(file)
embeddings_model = experiment.get_llm_service().get_openai_embeddings()
PGVector.from_documents(documents, embeddings_model, experiment)


def load_rag_file(file: File) -> list[Document]:
"""
Loads a text file of any supported type (PDF, TXT, HTML) into Langchain.
"""

    if file.content_type == "application/pdf":
        # pymupdf's stream argument expects raw bytes, not a file object
        with file.file.open("rb") as f:
            doc = PyMuPDFDocument(stream=f.read(), filetype="pdf")
md_text = pymupdf4llm.to_markdown(doc)
splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = splitter.create_documents([md_text])
elif file.content_type.startswith("text"):
        with file.file.open("rb") as f:
            # FieldFile.open() defaults to binary mode, so decode before handing the text to LangChain
            text = f.read().decode("utf-8")
        metadata = {"source": file.name}
        doc = Document(page_content=text, metadata=metadata)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents([doc])
else:
raise ValueError(f"Unsupported file type: {file.content_type}")

return documents


@shared_task
def get_prompt_builder_response_task(team_id: int, user_id, data_dict: dict) -> dict[str, str | int]:
llm_service = LlmProvider.objects.get(id=data_dict["provider"]).get_llm_service()
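For reference, a minimal sketch of how the new task might be exercised from application code or a shell. The IDs are placeholders, and it assumes the file has already been attached to the experiment and that a Celery worker is running:

```python
from apps.experiments.models import Experiment
from apps.experiments.tasks import store_rag_embedding

experiment = Experiment.objects.get(id=1)  # placeholder ID
file = experiment.files.first()            # any previously uploaded RAG file

# Queue the job; the worker loads the file, splits it into ~1000-character
# chunks, embeds them, and stores one Embedding row per chunk via PGVector.
store_rag_embedding.delay(experiment.id, file.id)
```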
7 changes: 6 additions & 1 deletion apps/experiments/views/experiment.py
@@ -53,7 +53,7 @@
from apps.experiments.helpers import get_real_user_or_none
from apps.experiments.models import Experiment, ExperimentSession, Participant, SessionStatus, SyntheticVoice
from apps.experiments.tables import ExperimentSessionsTable, ExperimentTable
from apps.experiments.tasks import get_response_for_webchat_task
from apps.experiments.tasks import get_response_for_webchat_task, store_rag_embedding
from apps.experiments.views.prompt import PROMPT_DATA_SESSION_KEY
from apps.files.forms import get_file_formset
from apps.files.views import BaseAddFileHtmxView, BaseDeleteFileView
@@ -220,6 +220,10 @@ def _validate_prompt_variables(form_data):
available_variables = set()
if form_data.get("source_material"):
available_variables.add("source_material")
    # TODO: only add "context" and "input" after checking the database for
    # RAG files uploaded to this experiment
    available_variables.add("context")
    available_variables.add("input")
missing_vars = required_variables - available_variables
known_vars = {"source_material"}
if missing_vars:
@@ -361,6 +365,7 @@ def form_valid(self, form):
experiment = get_object_or_404(Experiment, team=self.request.team, pk=self.kwargs["pk"])
file = super().form_valid(form)
experiment.files.add(file)
        store_rag_embedding.delay(experiment.id, file.id)  # queue embedding generation asynchronously
return file

def get_delete_url(self, file):
9 changes: 9 additions & 0 deletions apps/service_providers/llm_service/main.py
@@ -7,6 +7,7 @@
from langchain_community.chat_models import ChatAnthropic
from langchain_core.callbacks import BaseCallbackHandler
from langchain_core.language_models import BaseLanguageModel
from langchain_openai import OpenAIEmbeddings
from langchain_openai.chat_models import AzureChatOpenAI, ChatOpenAI
from openai import OpenAI
from openai._base_client import SyncAPIClient
@@ -42,6 +43,14 @@ class OpenAILlmService(LlmService):
openai_api_base: str = None
openai_organization: str = None

def get_openai_embeddings(self, model="text-embedding-3-small") -> OpenAIEmbeddings:
return OpenAIEmbeddings(
openai_api_key=self.openai_api_key,
openai_api_base=self.openai_api_base,
openai_organization=self.openai_organization,
model=model,
)

def get_raw_client(self) -> OpenAI:
return OpenAI(api_key=self.openai_api_key, organization=self.openai_organization, base_url=self.openai_api_base)

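A small usage sketch of the new helper, assuming an already-configured OpenAILlmService instance named llm_service; the strings are illustrative:

```python
embeddings = llm_service.get_openai_embeddings()  # defaults to "text-embedding-3-small"

# Batch-embed document chunks and embed a single query; both return
# 1536-dimensional vectors, matching the VectorField in apps.vectordb.models.
chunk_vectors = embeddings.embed_documents(["first chunk", "second chunk"])
query_vector = embeddings.embed_query("What does the document cover?")
```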
23 changes: 22 additions & 1 deletion apps/service_providers/llm_service/runnables.py
@@ -29,6 +29,7 @@
from apps.chat.conversation import compress_chat_history
from apps.chat.models import ChatMessage, ChatMessageType
from apps.experiments.models import Experiment, ExperimentSession
from apps.vectordb.vectorstore import PGVector

logger = logging.getLogger(__name__)

@@ -51,7 +52,8 @@ def create_experiment_runnable(experiment: Experiment, session: ExperimentSessio
assert experiment.llm_provider, "Experiment must have an LLM provider"
if experiment.tools_enabled:
return AgentExperimentRunnable(experiment=experiment, session=session)

if experiment.files.exists():
return RagExperimentRunnable(experiment=experiment, session=session)
return SimpleExperimentRunnable(experiment=experiment, session=session)


@@ -219,6 +221,25 @@ def _build_chain(self) -> Runnable[dict[str, Any], str]:
)


class RagExperimentRunnable(ExperimentRunnable):
def _build_chain(self) -> Runnable[dict[str, Any], str]:
def format_docs(docs):
return "\n\n".join(doc.page_content for doc in docs)

model = self.llm_service.get_chat_model(self.experiment.llm, self.experiment.temperature)
embeddings = self.experiment.get_llm_service().get_openai_embeddings()
retriever = PGVector(self.experiment, embeddings).as_retriever()
return (
{"context": retriever | format_docs, "input": RunnablePassthrough()}
| RunnablePassthrough.assign(
history=RunnableLambda(self.memory.load_memory_variables) | itemgetter("history")
)
| self.prompt
| model
| StrOutputParser()
)


class AgentExperimentRunnable(ExperimentRunnable):
def _parse_output(self, output):
return output.get("output", "")
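The RAG chain above feeds "context", "input", and "history" into self.prompt, which is why _validate_prompt_variables now accepts "context" and "input" as available variables. A hypothetical prompt template with that shape (the actual prompt text comes from the experiment configuration):

```python
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Answer using only the following context:\n\n{context}"),
        MessagesPlaceholder("history"),
        ("human", "{input}"),
    ]
)
```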
32 changes: 32 additions & 0 deletions apps/utils/chunked.py
@@ -0,0 +1,32 @@
from itertools import islice


def chunked(it, n, collection=tuple):
"""
>>> for nums in chunked(range(10), 4):
... print(nums)
...
(0, 1, 2, 3)
(4, 5, 6, 7)
(8, 9)
>>> for nums in chunked(range(10), 4, list):
... print(nums)
...
[0, 1, 2, 3]
[4, 5, 6, 7]
[8, 9]
"""
itr = iter(it)
    while True:
        # take() returns an empty collection once the iterator is exhausted,
        # so there is no StopIteration to handle
        items = take(n, itr, collection)
        if not items:
            break
        yield items


def take(n, iterable, collection=list):
# https://docs.python.org/2/library/itertools.html#recipes
return collection(islice(iterable, n))
Empty file added apps/vectordb/__init__.py
Empty file.
6 changes: 6 additions & 0 deletions apps/vectordb/apps.py
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class VectorDbConfig(AppConfig):
name = "apps.vectordb"
label = "vectordb"
9 changes: 9 additions & 0 deletions apps/vectordb/const.py
@@ -0,0 +1,9 @@
META_EMBEDDING_ID = "embedding_id"

META_EXPERIMENT_ID = "experiment_id"

META_FILE_ID = "file_id"

META_SEARCH_SCORE = "search_score"

META_ALL = (META_EMBEDDING_ID, META_EXPERIMENT_ID, META_FILE_ID, META_SEARCH_SCORE)
66 changes: 66 additions & 0 deletions apps/vectordb/migrations/0001_initial.py
@@ -0,0 +1,66 @@
# Generated by Django 4.2.7 on 2024-03-22 12:11

from django.db import migrations, models
import django.db.models.deletion
import pgvector.django


class Migration(migrations.Migration):
initial = True

dependencies = [
("files", "0001_initial"),
("experiments", "0070_alter_consentform_name_alter_experiment_llm_and_more"),
("teams", "0005_invitation_groups"),
]

operations = [
pgvector.django.VectorExtension(),
migrations.CreateModel(
name="Embedding",
fields=[
(
"id",
models.BigAutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("created_at", models.DateTimeField(auto_now_add=True)),
("updated_at", models.DateTimeField(auto_now=True)),
("embedding", pgvector.django.VectorField(dimensions=1536)),
("document", models.TextField(null=True)),
("metadata", models.JSONField(null=True)),
(
"experiment",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="embeddings",
to="experiments.experiment",
),
),
(
"file",
models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
to="files.file",
),
),
(
"team",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to="teams.team",
verbose_name="Team",
),
),
],
options={
"abstract": False,
},
),
]
Empty file added apps/vectordb/migrations/__init__.py
Empty file.
14 changes: 14 additions & 0 deletions apps/vectordb/models.py
@@ -0,0 +1,14 @@
from django.db import models
from pgvector.django import VectorField

from apps.teams.models import BaseTeamModel

ADA_TOKEN_COUNT = 1536  # OpenAI embedding size; ada-002 and text-embedding-3-small both return 1536 dimensions


class Embedding(BaseTeamModel):
experiment = models.ForeignKey("experiments.Experiment", on_delete=models.CASCADE, related_name="embeddings")
embedding = VectorField(dimensions=ADA_TOKEN_COUNT)
document = models.TextField(null=True) # noqa: DJ001
metadata = models.JSONField(null=True)
file = models.ForeignKey("files.File", on_delete=models.CASCADE, null=True, blank=True)
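For context, a hypothetical nearest-neighbour query against this model using pgvector's Django distance helpers; the query vector would come from the embeddings model shown earlier, and the result is roughly what the PGVector retriever hands to the RAG chain:

```python
from pgvector.django import CosineDistance

from apps.vectordb.models import Embedding

query_vector = embeddings.embed_query("billing questions")  # 1536 floats

# Four closest chunks for this experiment, ordered by cosine distance.
nearest = (
    Embedding.objects.filter(experiment=experiment)
    .annotate(distance=CosineDistance("embedding", query_vector))
    .order_by("distance")[:4]
)
context = "\n\n".join(e.document for e in nearest)
```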
Empty file added apps/vectordb/tests/__init__.py
Empty file.