Added a new VLLMChat class for integrating with vLLM servers #81

Open · wants to merge 1 commit into base: 1.0.x
3 changes: 3 additions & 0 deletions .env_example
@@ -12,3 +12,6 @@ HF_TOKEN=your_huggingface_api_key_here

# OpenAI credentials
OPENAI_API_KEY=your_openai_api_key_here

# vLLM URL
VLLM_URL=http://localhost:8000
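The new VLLM_URL variable is presumably meant to be read by user code and passed to the connector's vllm_url parameter; nothing in this diff reads it automatically. A minimal sketch under that assumption (the env-var handling here is illustrative, not part of the PR):

```python
import os

from artkit.model.llm.vllm import VLLMChat

# Read the server URL configured in .env; fall back to the documented default.
vllm_url = os.getenv("VLLM_URL", "http://localhost:8000")

# model_id must name a model that the vLLM server is actually serving.
chat = VLLMChat(
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",
    vllm_url=vllm_url,
)
```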
6 changes: 6 additions & 0 deletions RELEASE_NOTES.rst
@@ -1,6 +1,12 @@
Release Notes
=============

*artkit* 1.0.10
---------------
This release adds a connector for the `vLLM <https://github.com/vllm-project/vllm>`__ LLM server.

- API: Added :class:`.VLLMChat` to allow ARTKIT to interface with vLLM-compatible LLM servers.

*artkit* 1.0.9
--------------

2 changes: 1 addition & 1 deletion src/artkit/__init__.py
@@ -26,4 +26,4 @@
remains synchronized with the latest codebase updates.
"""

__version__ = "1.0.9"
__version__ = "1.0.10"
1 change: 1 addition & 0 deletions src/artkit/api.py
@@ -51,6 +51,7 @@
from .model.llm.openai import *
from .model.llm.util import *
from .model.llm.vertexai import *
from .model.llm.vllm import *
from .model.vision import *
from .model.vision.base import VisionModel
from .model.vision.openai import *
21 changes: 21 additions & 0 deletions src/artkit/model/llm/vllm/__init__.py
@@ -0,0 +1,21 @@
# -----------------------------------------------------------------------------
# © 2024 Boston Consulting Group. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -----------------------------------------------------------------------------

"""
Base classes for vLLM LLM connections.
"""

from ._vllm import *
161 changes: 161 additions & 0 deletions src/artkit/model/llm/vllm/_vllm.py
@@ -0,0 +1,161 @@
# -----------------------------------------------------------------------------
# © 2024 Boston Consulting Group. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -----------------------------------------------------------------------------

"""
vLLM LLM systems.
"""
from __future__ import annotations

import logging
from abc import ABCMeta
from collections.abc import Iterator
from contextlib import AsyncExitStack
from typing import Any, TypeVar

from openai import AsyncOpenAI, RateLimitError
from openai.types.chat import ChatCompletion

from artkit.model.llm.history._history import ChatHistory
from pytools.api import appenddoc, inheritdoc, subsdoc

from ...util import RateLimitException
from ..base import ChatModelConnector

logger = logging.getLogger(__name__)

__all__ = ["VLLMChat"]


T_VLLMChat = TypeVar("T_VLLMChat", bound="VLLMChat")


@inheritdoc(match="""[see superclass]""")
class VLLMChat(ChatModelConnector[AsyncOpenAI], metaclass=ABCMeta):
"""
Base class for vLLM LLMs.
"""

vllm_url: str

@classmethod
def get_default_api_key_env(cls) -> str:
"""vLLM requires no API key since it's a self-managed server."""
return ""

def _make_client(self) -> AsyncOpenAI: # pragma: no cover
"""
This method handles the authentication and connection to the vLLM server.
Since vLLM implements the OpenAI API spec, we can use the OpenAI client
to connect to it.
"""
return AsyncOpenAI(api_key="EMPTY", base_url=self.vllm_url)

@subsdoc(
pattern=r"(:param model_params: .*\n)((:?.|\n)*\S)(\n|\s)*",
replacement=r"\2\1",
)
@appenddoc(to=ChatModelConnector.__init__)
def __init__(
self,
*,
model_id: str,
api_key_env: str | None = None,
initial_delay: float = 1,
exponential_base: float = 2,
jitter: bool = True,
max_retries: int = 10,
system_prompt: str | None = None,
vllm_url: str,
**model_params: Any,
) -> None:
"""
:param vllm_url: The URL of the vLLM server.
"""
super().__init__(
model_id=model_id,
api_key_env=api_key_env,
initial_delay=initial_delay,
exponential_base=exponential_base,
jitter=jitter,
max_retries=max_retries,
system_prompt=system_prompt,
**model_params,
)
self.vllm_url = vllm_url

async def get_response(
self,
message: str,
*,
history: ChatHistory | None = None,
**model_params: Any,
) -> list[str]:
"""[see superclass]"""
async with AsyncExitStack():
try:
completion = await self.get_client().chat.completions.create(
messages=list(
self._messages_to_openai_format( # type: ignore[arg-type]
message, history=history
)
),
model=self.model_id,
**{**self.get_model_params(), **model_params},
)
except RateLimitError as e:
raise RateLimitException(
"Rate limit exceeded. Please try again later."
) from e

return list(self._responses_from_completion(completion))

def _messages_to_openai_format(
self, user_message: str, *, history: ChatHistory | None = None
) -> Iterator[dict[str, str]]:
"""
Get the messages to send to the vLLM server, based on the given user prompt,
the chat history, and the system prompt for this LLM.

:param user_message: the user prompt to send to the vLLM server
:param history: the chat history to include in the messages (optional)
:return: the messages object, in the format expected by the OpenAI API
"""
if self.system_prompt:
yield {"role": "system", "content": self.system_prompt}

if history is not None:
for message in history.messages:
yield {"role": message.role, "content": message.text}

yield {"role": "user", "content": user_message}

@staticmethod
def _responses_from_completion(completion: ChatCompletion) -> Iterator[str]:
"""
Get the response from the given chat completion.

:param completion: the chat completion to process
:return: the alternative responses (one per completion choice)
"""

for choice in completion.choices:
message = choice.message
if message.role != "assistant":
logger.warning(
"Expected only assistant messages, but got completion choice "
f"{choice!r}"
)
yield str(message.content)
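For reviewers, a minimal usage sketch of the new connector (not part of the diff): it assumes a vLLM server is already running at http://localhost:8000 and serving the model named below, i.e. the same values the test fixtures use (or the URL can be read from VLLM_URL as sketched earlier).

```python
import asyncio

from artkit.model.llm.vllm import VLLMChat

# model_id must match a model served by the vLLM instance; vllm_url must point
# at the server's OpenAI-compatible endpoint (here, the default from .env_example).
chat = VLLMChat(
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",
    vllm_url="http://localhost:8000",
)


async def main() -> None:
    # get_response() returns a list of responses, one per completion choice.
    responses = await chat.get_response(
        message="What color is the sky? Please answer in one word."
    )
    print(responses[0])


asyncio.run(main())
```

Since `_make_client()` simply wraps `AsyncOpenAI` with a placeholder API key, other servers that expose the same OpenAI-style chat-completions endpoint should work in the same way.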
22 changes: 22 additions & 0 deletions test/artkit_test/conftest.py
@@ -10,6 +10,7 @@
from artkit.model.llm import CachedChatModel
from artkit.model.llm.base import ChatModel
from artkit.model.llm.openai import OpenAIChat
from artkit.model.llm.vllm import VLLMChat
from artkit.util import Image

log = logging.getLogger(__name__)
@@ -95,3 +96,24 @@ def image() -> Image:
binary_png = base64.b64decode(base64_png)

return Image(data=binary_png)


@pytest.fixture(scope="session")
def vllm_url() -> str:
return "http://localhost:8000"


@pytest.fixture
def vllm_model_id() -> str:
return "meta-llama/Meta-Llama-3-8B-Instruct"


@pytest.fixture
def vllm_chat(vllm_url: str, vllm_model_id: str) -> VLLMChat:
return VLLMChat(
model_id=vllm_model_id,
max_retries=2,
initial_delay=0.1,
exponential_base=1.5,
vllm_url=vllm_url,
)
70 changes: 70 additions & 0 deletions test/artkit_test/model/llm/test_vllm.py
@@ -0,0 +1,70 @@
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from openai import RateLimitError

from artkit.model.llm.vllm import VLLMChat
from artkit.model.util import RateLimitException

# The vLLM connector is built on the openai client; skip these tests if it is missing
_ = pytest.importorskip("openai")


@pytest.mark.asyncio
async def test_vllm(vllm_chat: VLLMChat) -> None:
# Mock openai Client
with patch("artkit.model.llm.vllm._vllm.AsyncOpenAI") as mock_get_client:
# Mock openai Client response
mock_response = AsyncMock(
return_value=AsyncMock(
choices=[MagicMock(message=MagicMock(content="blue", role="assistant"))]
)
)

# Set mock response as return value
mock_get_client.return_value.chat.completions.create = mock_response

# Call mocked model
messages = await vllm_chat.get_response(
message="What color is the sky? Please answer in one word."
)
assert "blue" in messages[0].lower()


@pytest.mark.asyncio
async def test_vllm_retry(
vllm_chat: VLLMChat, caplog: pytest.LogCaptureFixture
) -> None:
# Mock openai Client
with patch("artkit.model.llm.vllm._vllm.AsyncOpenAI") as mock_get_client:
# Set mock response as return value
response = MagicMock()
response.status_code = 429

# Mock exception on method call
mock_get_client.return_value.chat.completions.create.side_effect = (
RateLimitError(
message="Rate Limit exceeded",
response=response,
body=MagicMock(),
)
)

with pytest.raises(RateLimitException):
# Call mocked model
await vllm_chat.get_response(
message="What color is the sky? Please answer in one word."
)
assert (
mock_get_client.return_value.chat.completions.create.call_count
== vllm_chat.max_retries
)
assert (
len(
[
record
for record in caplog.records
if record.message.startswith("Rate limit exceeded")
]
)
== vllm_chat.max_retries
)