vLLM 0.5.3+ support #60

Merged 2 commits on Jul 24, 2024
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
@@ -32,7 +32,7 @@ jobs:
pyv: ["3.11"]
vllm_version:
# - "" # skip the pypi version as it will not work on CPU
- "git+https://github.com/vllm-project/vllm@v0.5.2"
- "git+https://github.com/vllm-project/vllm@v0.5.3.post1"
- "git+https://github.com/vllm-project/vllm@main"
- "git+https://github.com/opendatahub-io/vllm@main"

3 changes: 1 addition & 2 deletions pyproject.toml
@@ -26,12 +26,11 @@ classifiers = [
requires-python = ">=3.9"
dynamic = ["version"]
dependencies = [
"vllm>=0.5.2",
"vllm>=0.5.3.post1",
"prometheus_client==0.20.0",
"grpcio==1.62.2",
"grpcio-health-checking==1.62.2",
"grpcio-reflection==1.62.2",
"transformers==4.42.4",
"accelerate==0.32.1",
"hf-transfer==0.1.6",
# additional dependencies for OpenTelemetry tracing
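Not part of the diff, but for context: the floor moves from vllm 0.5.2 to 0.5.3.post1 and the explicit transformers pin is dropped, presumably deferring to vllm's own transformers constraint. A minimal sketch of a runtime guard for the new floor, assuming `packaging` is importable (it is a transitive dependency of vllm); the check itself is hypothetical and not part of this PR:

```python
# Hypothetical guard: verify the installed vllm satisfies the ">=0.5.3.post1"
# floor declared in pyproject.toml before the adapter starts.
from importlib.metadata import version

from packaging.version import Version

MIN_VLLM = Version("0.5.3.post1")

installed = Version(version("vllm"))
if installed < MIN_VLLM:
    raise RuntimeError(f"vllm {installed} found, but >= {MIN_VLLM} is required")
```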
89 changes: 50 additions & 39 deletions src/vllm_tgis_adapter/__main__.py
@@ -22,6 +22,7 @@
from vllm import envs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import (  # noqa: TCH002 # pydantic needs to access these annotations
    ChatCompletionRequest,
@@ -37,6 +38,9 @@
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
+from vllm.entrypoints.openai.serving_tokenization import (
+    OpenAIServingTokenization,
+)
from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser

@@ -51,22 +55,12 @@
    from vllm.config import ModelConfig


-try:
-    from vllm.entrypoints.openai.serving_tokenization import (
-        OpenAIServingTokenization,  # noqa: TCH002
-    )
-except ImportError:  # vllm<=0.5.2
-    has_tokenization = False
-else:
-    has_tokenization = True
-
TIMEOUT_KEEP_ALIVE = 5  # seconds

openai_serving_chat: OpenAIServingChat
openai_serving_completion: OpenAIServingCompletion
openai_serving_embedding: OpenAIServingEmbedding
-if has_tokenization:
-    openai_serving_tokenization: OpenAIServingTokenization
+openai_serving_tokenization: OpenAIServingTokenization

logger = init_logger(__name__)

@@ -91,31 +85,29 @@ async def health() -> Response:
    return Response(status_code=200)


-if has_tokenization:
-    assert has_tokenization
-
-    @router.post("/tokenize")
-    async def tokenize(request: TokenizeRequest) -> JSONResponse:
-        generator = await openai_serving_tokenization.create_tokenize(request)  # noqa: F821
-        if isinstance(generator, ErrorResponse):
-            return JSONResponse(
-                content=generator.model_dump(),
-                status_code=generator.code,
-            )
-        assert isinstance(generator, TokenizeResponse)
-        return JSONResponse(content=generator.model_dump())
-
-    @router.post("/detokenize")
-    async def detokenize(request: DetokenizeRequest) -> JSONResponse:
-        generator = await openai_serving_tokenization.create_detokenize(request)  # noqa: F821
-        if isinstance(generator, ErrorResponse):
-            return JSONResponse(
-                content=generator.model_dump(),
-                status_code=generator.code,
-            )
-
-        assert isinstance(generator, DetokenizeResponse)
-        return JSONResponse(content=generator.model_dump())
+@router.post("/tokenize")
+async def tokenize(request: TokenizeRequest) -> JSONResponse:
+    generator = await openai_serving_tokenization.create_tokenize(request)
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(),
+            status_code=generator.code,
+        )
+    assert isinstance(generator, TokenizeResponse)
+    return JSONResponse(content=generator.model_dump())
+
+
+@router.post("/detokenize")
+async def detokenize(request: DetokenizeRequest) -> JSONResponse:
+    generator = await openai_serving_tokenization.create_detokenize(request)
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(),
+            status_code=generator.code,
+        )
+
+    assert isinstance(generator, DetokenizeResponse)
+    return JSONResponse(content=generator.model_dump())


@router.get("/v1/models")
@@ -251,28 +243,47 @@ async def run_http_server(
    else:
        served_model_names = [args.model]

+    if args.disable_log_requests:
+        request_logger = None
+    else:
+        request_logger = RequestLogger(max_log_len=args.max_log_len)

    global openai_serving_chat  # noqa: PLW0603
    global openai_serving_completion  # noqa: PLW0603
    global openai_serving_embedding  # noqa: PLW0603
    global openai_serving_tokenization  # noqa: PLW0603

    openai_serving_chat = OpenAIServingChat(
        engine,
        model_config,
        served_model_names,
        args.response_role,
-        args.lora_modules,
-        args.chat_template,
+        lora_modules=args.lora_modules,
+        prompt_adapters=args.prompt_adapters,
+        request_logger=request_logger,
+        chat_template=args.chat_template,
    )

    openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config,
        served_model_names,
-        args.lora_modules,
+        lora_modules=args.lora_modules,
+        prompt_adapters=args.prompt_adapters,
+        request_logger=request_logger,
    )
    openai_serving_embedding = OpenAIServingEmbedding(
-        engine, model_config, served_model_names
+        engine,
+        model_config,
+        served_model_names,
+        request_logger=request_logger,
    )
+    openai_serving_tokenization = OpenAIServingTokenization(
+        engine,
+        model_config,
+        served_model_names,
+        lora_modules=args.lora_modules,
+        request_logger=request_logger,
+        chat_template=args.chat_template,
+    )
    app.root_path = args.root_path
    config = UvicornConfig(
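Not part of the diff, but for context: with the import guard gone, /tokenize and /detokenize are always registered. A quick smoke test against a locally running adapter; the base URL and model name are placeholders, and the payload and response field names follow vllm 0.5.3's TokenizeRequest/TokenizeResponse and DetokenizeRequest/DetokenizeResponse schemas, which may differ in other versions:

```python
# Sketch only: exercise the /tokenize and /detokenize routes of a running adapter.
import requests

BASE_URL = "http://localhost:8000"  # placeholder address
MODEL = "my-model"                  # placeholder served model name

tok = requests.post(
    f"{BASE_URL}/tokenize",
    json={"model": MODEL, "prompt": "Hello world"},
    timeout=30,
)
tok.raise_for_status()
token_ids = tok.json()["tokens"]
print("token ids:", token_ids)

detok = requests.post(
    f"{BASE_URL}/detokenize",
    json={"model": MODEL, "tokens": token_ids},
    timeout=30,
)
detok.raise_for_status()
print("round-tripped text:", detok.json()["prompt"])
```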
25 changes: 8 additions & 17 deletions src/vllm_tgis_adapter/grpc/grpc_server.py
@@ -20,7 +20,7 @@
from vllm import AsyncLLMEngine, SamplingParams
from vllm.engine.async_llm_engine import _AsyncLLMEngine
from vllm.entrypoints.openai.serving_completion import merge_async_iterators
-from vllm.inputs import TextTokensPrompt
+from vllm.inputs import LLMInputs
from vllm.tracing import (
    contains_trace_headers,
    extract_trace_headers,
@@ -243,7 +243,7 @@ async def Generate(
            sampling_params, truncate_input_tokens, req.text, tokenizer, context
        )

-        inputs = TextTokensPrompt(
+        inputs = LLMInputs(
            prompt=req.text,
            prompt_token_ids=input_ids,
        )
@@ -344,8 +344,9 @@ async def GenerateStream(
            context,
        )

-        inputs = TextTokensPrompt(
-            prompt=request.request.text, prompt_token_ids=input_ids
+        inputs = LLMInputs(
+            prompt=request.request.text,
+            prompt_token_ids=input_ids,
        )

        result_generator = self.engine.generate(
@@ -639,19 +640,9 @@ async def _validate_adapters(
    async def _get_tokenizer(
        self, adapter_kwargs: dict[str, Any]
    ) -> PreTrainedTokenizer:
-        lora_request = adapter_kwargs.get("lora_request")
-        try:
-            return await self.engine.get_tokenizer(lora_request)
-        except TypeError as exc:
-            # vllm <= 0.5.2
-            if "takes 1 positional argument but 2 were given" not in str(exc):
-                raise
-
-            return (
-                await self.engine.engine.get_tokenizer_group().get_lora_tokenizer_async(
-                    lora_request
-                )
-            )
+        return await self.engine.get_tokenizer(
+            adapter_kwargs.get("lora_request"),
+        )

    @staticmethod
    def _convert_reason(
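Not part of the diff, but for context: the handlers above now wrap pre-tokenized prompts in LLMInputs instead of TextTokensPrompt. A sketch of that call pattern in isolation; the helper name and its arguments are illustrative, and the positional argument order for AsyncLLMEngine.generate follows vllm 0.5.3:

```python
# Sketch only: feed an already-tokenized prompt to the engine via LLMInputs,
# mirroring the Generate/GenerateStream handlers above.
from collections.abc import AsyncIterator

from vllm import AsyncLLMEngine, SamplingParams
from vllm.inputs import LLMInputs
from vllm.outputs import RequestOutput


async def generate_pretokenized(
    engine: AsyncLLMEngine,
    text: str,
    input_ids: list[int],
    request_id: str,
) -> AsyncIterator[RequestOutput]:
    # LLMInputs bundles the raw text with its token ids, so the engine does
    # not re-tokenize the prompt.
    inputs = LLMInputs(prompt=text, prompt_token_ids=input_ids)
    async for output in engine.generate(
        inputs,
        SamplingParams(max_tokens=32),
        request_id,
    ):
        yield output
```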