Added a new VLLMChat class for integrating with vLLM servers #81

Open · wants to merge 1 commit into base: 1.0.x
3 changes: 3 additions & 0 deletions .env_example
@@ -12,3 +12,6 @@ HF_TOKEN=your_huggingface_api_key_here

# OpenAI credentials
OPENAI_API_KEY=your_openai_api_key_here

# vLLM URL
VLLM_URL=http://localhost:8000
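The new VLLM_URL variable is presumably meant to be read by user code and passed to the connector's vllm_url parameter; nothing in this diff reads it automatically. A minimal sketch under that assumption (the env-var handling here is illustrative, not part of the PR):

```python
import os

from artkit.model.llm.vllm import VLLMChat

# Read the server URL configured in .env; fall back to the documented default.
vllm_url = os.getenv("VLLM_URL", "http://localhost:8000")

# model_id must name a model that the vLLM server is actually serving.
chat = VLLMChat(
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",
    vllm_url=vllm_url,
)
```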
6 changes: 6 additions & 0 deletions RELEASE_NOTES.rst
@@ -1,6 +1,12 @@
Release Notes
=============

*artkit* 1.0.10
---------------
This release adds a connector for the `vLLM <https://github.com/vllm-project/vllm>`__ LLM server.

- API: Added :class:`.VLLMChat` to allow ARTKIT to interface with vLLM-compatible LLM servers.

*artkit* 1.0.9
--------------

2 changes: 1 addition & 1 deletion src/artkit/__init__.py
@@ -26,4 +26,4 @@
remains synchronized with the latest codebase updates.
"""

__version__ = "1.0.9"
__version__ = "1.0.10"
1 change: 1 addition & 0 deletions src/artkit/api.py
@@ -51,6 +51,7 @@
from .model.llm.openai import *
from .model.llm.util import *
from .model.llm.vertexai import *
from .model.llm.vllm import *
from .model.vision import *
from .model.vision.base import VisionModel
from .model.vision.openai import *
21 changes: 21 additions & 0 deletions src/artkit/model/llm/vllm/__init__.py
@@ -0,0 +1,21 @@
# -----------------------------------------------------------------------------
# © 2024 Boston Consulting Group. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -----------------------------------------------------------------------------

"""
Base classes for vLLM LLM connections.
"""

from ._vllm import *
161 changes: 161 additions & 0 deletions src/artkit/model/llm/vllm/_vllm.py
@@ -0,0 +1,161 @@
# -----------------------------------------------------------------------------
# © 2024 Boston Consulting Group. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -----------------------------------------------------------------------------

"""
vLLM LLM systems.
"""
from __future__ import annotations

import logging
from abc import ABCMeta
from collections.abc import Iterator
from contextlib import AsyncExitStack
from typing import Any, TypeVar

from openai import AsyncOpenAI, RateLimitError
from openai.types.chat import ChatCompletion

from artkit.model.llm.history._history import ChatHistory
from pytools.api import appenddoc, inheritdoc, subsdoc

from ...util import RateLimitException
from ..base import ChatModelConnector

logger = logging.getLogger(__name__)

__all__ = ["VLLMChat"]


T_VLLMChat = TypeVar("T_VLLMChat", bound="VLLMChat")


@inheritdoc(match="""[see superclass]""")
class VLLMChat(ChatModelConnector[AsyncOpenAI], metaclass=ABCMeta):
"""
Base class for vLLM LLMs.
"""

vllm_url: str

@classmethod
def get_default_api_key_env(cls) -> str:
"""vLLM requires no API key since it's a self-managed server."""
return ""

def _make_client(self) -> AsyncOpenAI: # pragma: no cover
"""
This method handles the authentication and connection to the vLLM server.
Since vLLM implements the OpenAI API spec, we can use the OpenAI client
to connect to it.
"""
return AsyncOpenAI(api_key="EMPTY", base_url=self.vllm_url)

@subsdoc(
pattern=r"(:param model_params: .*\n)((:?.|\n)*\S)(\n|\s)*",
replacement=r"\2\1",
)
@appenddoc(to=ChatModelConnector.__init__)
def __init__(
self,
*,
model_id: str,
api_key_env: str | None = None,
initial_delay: float = 1,
exponential_base: float = 2,
jitter: bool = True,
max_retries: int = 10,
system_prompt: str | None = None,
vllm_url: str,
**model_params: Any,
) -> None:
"""
:param vllm_url: The URL of the vLLM server.
"""
super().__init__(
model_id=model_id,
api_key_env=api_key_env,
initial_delay=initial_delay,
exponential_base=exponential_base,
jitter=jitter,
max_retries=max_retries,
system_prompt=system_prompt,
**model_params,
)
self.vllm_url = vllm_url

async def get_response(
self,
message: str,
*,
history: ChatHistory | None = None,
**model_params: Any,
) -> list[str]:
"""[see superclass]"""
async with AsyncExitStack():
try:
completion = await self.get_client().chat.completions.create(
messages=list(
self._messages_to_openai_format( # type: ignore[arg-type]
message, history=history
)
),
model=self.model_id,
**{**self.get_model_params(), **model_params},
)
except RateLimitError as e:
raise RateLimitException(
"Rate limit exceeded. Please try again later."
) from e

return list(self._responses_from_completion(completion))

def _messages_to_openai_format(
self, user_message: str, *, history: ChatHistory | None = None
) -> Iterator[dict[str, str]]:
"""
Get the messages to send to the vLLM server, based on the given user prompt,
the chat history, and the system prompt for this LLM.

:param user_message: the user prompt to send to the vLLM server
:param history: the chat history to include in the messages (optional)
:return: the messages object, in the format expected by the OpenAI API
"""
if self.system_prompt:
yield {"role": "system", "content": self.system_prompt}

if history is not None:
for message in history.messages:
yield {"role": message.role, "content": message.text}

yield {"role": "user", "content": user_message}

@staticmethod
def _responses_from_completion(completion: ChatCompletion) -> Iterator[str]:
"""
Get the response from the given chat completion.

:param completion: the chat completion to process
:return: the alternative responses (one per completion choice)
"""

for choice in completion.choices:
message = choice.message
if message.role != "assistant":
logger.warning(
"Expected only assistant messages, but got completion choice "
f"{choice!r}"
)
yield str(message.content)
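For reviewers, a minimal usage sketch of the new connector (not part of the diff): it assumes a vLLM server is already running at http://localhost:8000 and serving the model named below, i.e. the same values the test fixtures use (or the URL can be read from VLLM_URL as sketched earlier).

```python
import asyncio

from artkit.model.llm.vllm import VLLMChat

# model_id must match a model served by the vLLM instance; vllm_url must point
# at the server's OpenAI-compatible endpoint (here, the default from .env_example).
chat = VLLMChat(
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",
    vllm_url="http://localhost:8000",
)


async def main() -> None:
    # get_response() returns a list of responses, one per completion choice.
    responses = await chat.get_response(
        message="What color is the sky? Please answer in one word."
    )
    print(responses[0])


asyncio.run(main())
```

Since `_make_client()` simply wraps `AsyncOpenAI` with a placeholder API key, other servers that expose the same OpenAI-style chat-completions endpoint should work in the same way.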
22 changes: 22 additions & 0 deletions test/artkit_test/conftest.py
@@ -10,6 +10,7 @@
from artkit.model.llm import CachedChatModel
from artkit.model.llm.base import ChatModel
from artkit.model.llm.openai import OpenAIChat
from artkit.model.llm.vllm import VLLMChat
from artkit.util import Image

log = logging.getLogger(__name__)
@@ -95,3 +96,24 @@ def image() -> Image:
binary_png = base64.b64decode(base64_png)

return Image(data=binary_png)


@pytest.fixture(scope="session")
def vllm_url() -> str:
return "http://localhost:8000"


@pytest.fixture
def vllm_model_id() -> str:
return "meta-llama/Meta-Llama-3-8B-Instruct"


@pytest.fixture
def vllm_chat(vllm_url: str, vllm_model_id: str) -> VLLMChat:
return VLLMChat(
model_id=vllm_model_id,
max_retries=2,
initial_delay=0.1,
exponential_base=1.5,
vllm_url=vllm_url,
)
70 changes: 70 additions & 0 deletions test/artkit_test/model/llm/test_vllm.py
@@ -0,0 +1,70 @@
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from openai import RateLimitError

from artkit.model.llm.vllm import VLLMChat
from artkit.model.util import RateLimitException

# The vLLM connector is built on the openai client; skip these tests if it is missing
_ = pytest.importorskip("openai")


@pytest.mark.asyncio
async def test_vllm(vllm_chat: VLLMChat) -> None:
# Mock openai Client
with patch("artkit.model.llm.vllm._vllm.AsyncOpenAI") as mock_get_client:
# Mock openai Client response
mock_response = AsyncMock(
return_value=AsyncMock(
choices=[MagicMock(message=MagicMock(content="blue", role="assistant"))]
)
)

# Set mock response as return value
mock_get_client.return_value.chat.completions.create = mock_response

# Call mocked model
messages = await vllm_chat.get_response(
message="What color is the sky? Please answer in one word."
)
assert "blue" in messages[0].lower()


@pytest.mark.asyncio
async def test_vllm_retry(
vllm_chat: VLLMChat, caplog: pytest.LogCaptureFixture
) -> None:
# Mock openai Client
with patch("artkit.model.llm.vllm._vllm.AsyncOpenAI") as mock_get_client:
# Set mock response as return value
response = MagicMock()
response.status_code = 429

# Mock exception on method call
mock_get_client.return_value.chat.completions.create.side_effect = (
RateLimitError(
message="Rate Limit exceeded",
response=response,
body=MagicMock(),
)
)

with pytest.raises(RateLimitException):
# Call mocked model
await vllm_chat.get_response(
message="What color is the sky? Please answer in one word."
)
assert (
mock_get_client.return_value.chat.completions.create.call_count
== vllm_chat.max_retries
)
assert (
len(
[
record
for record in caplog.records
if record.message.startswith("Rate limit exceeded")
]
)
== vllm_chat.max_retries
)