Mirror of https://github.com/bentoml/OpenLLM.git, synced 2026-05-03 21:32:46 -04:00
feat: mixtral support (#770)
Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
@@ -1,7 +1,6 @@
from __future__ import annotations
import logging, typing as t
import _service_vars as svars
import bentoml, openllm
import bentoml, openllm, _service_vars as svars
from openllm_core._schemas import MessageParam
from bentoml.io import JSON, Text
@@ -16,23 +15,23 @@ llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
@svc.api(
  route='/v1/generate',
  input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()), #
  input=JSON.from_sample(llm_model_class.examples()),
  output=JSON.from_sample(openllm.GenerationOutput.examples()),
)
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]: return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump()

@svc.api(
  route='/v1/generate_stream',
  input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'), #
  input=JSON.from_sample(llm_model_class.examples()),
  output=Text(content_type='text/event-stream'),
)
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
  async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
    yield f'data: {it.model_dump_json()}\n\n'
  async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()): yield f'data: {it.model_dump_json()}\n\n'
  yield 'data: [DONE]\n\n'

_Metadata = openllm.MetadataOutput(
  timeout=llm.config['timeout'], model_name=llm.config['model_name'], #
  backend=llm.__llm_backend__, model_id=llm.model_id, #
  configuration=llm.config.model_dump_json().decode(),
  backend=llm.__llm_backend__, model_id=llm.model_id, configuration=llm.config.model_dump_json().decode(), #
)

@svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
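For illustration only (not part of this patch): the /v1/generate_stream route above emits server-sent events, one data: line per generated chunk followed by a data: [DONE] sentinel. A minimal client sketch, assuming the service listens on localhost:3000, httpx is installed, and the request body only needs a prompt field:

# Minimal SSE client sketch for /v1/generate_stream; the host, port, and payload shape are assumptions.
import json
import httpx

async def stream_generate(prompt: str) -> None:
  async with httpx.AsyncClient(base_url='http://localhost:3000', timeout=None) as client:
    async with client.stream('POST', '/v1/generate_stream', json={'prompt': prompt}) as resp:
      async for line in resp.aiter_lines():
        if not line.startswith('data: '):
          continue
        payload = line[len('data: '):]
        if payload == '[DONE]':  # end-of-stream sentinel emitted by the service
          break
        print(json.loads(payload))  # one generation chunk per event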
@@ -31,7 +31,7 @@ def build_editable(path, package='openllm'):
  raise RuntimeError('Please install OpenLLM from PyPI or built it from Git source.')
def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None):
  from . import RefResolver
  packages = ['scipy', 'bentoml[tracing]>=1.1.10', 'vllm==0.2.4', 'ray==2.6.0', f'openllm>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
  packages = ['scipy', 'bentoml[tracing]>=1.1.10', f'openllm[vllm]>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
  if adapter_map is not None: packages += ['openllm[fine-tune]']
  if extra_dependencies is not None: packages += [f'openllm[{k}]' for k in extra_dependencies]
  if llm.config['requirements'] is not None: packages.extend(llm.config['requirements'])
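As a rough illustration (not from the patch): the hard vllm==0.2.4 and ray==2.6.0 pins are replaced by the openllm[vllm] extra, with the version resolved at build time via RefResolver. Assuming a hypothetical resolved release of 0.4.44 and an adapter_map being present, the list could end up as:

# Hypothetical example of the resolved `packages` list; the version number is made up for illustration.
packages = [
  'scipy',
  'bentoml[tracing]>=1.1.10',
  'openllm[vllm]>=0.4.44',  # vLLM now arrives via the extra rather than a hard pin
  'openllm[fine-tune]',     # appended only when adapter_map is not None
]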
@@ -50,7 +50,7 @@ def construct_docker_options(llm, _, quantize, adapter_map, dockerfile_template,
  environ['OPENLLM_CONFIG'] = f"'{environ['OPENLLM_CONFIG']}'"
  environ.pop('BENTOML_HOME', None) # NOTE: irrelevant in container
  environ['NVIDIA_DRIVER_CAPABILITIES'] = 'compute,utility'
  return DockerOptions(cuda_version='12.1', env=environ, dockerfile_template=dockerfile_template)
  return DockerOptions(cuda_version='12.1', python_version='3.11', env=environ, dockerfile_template=dockerfile_template)
@inject
def create_bento(
  bento_tag, llm_fs, llm, #
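For illustration only (not part of the patch): the bento now targets a Python 3.11, CUDA 12.1 base image, and the environment handed to DockerOptions roughly looks like the sketch below, with BENTOML_HOME dropped because it is irrelevant inside the container.

# Sketch of the container environment produced above; OPENLLM_CONFIG value is a placeholder.
environ = {
  'OPENLLM_CONFIG': "'{...serialized model configuration...}'",  # wrapped in single quotes for the Dockerfile
  'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility',
}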
@@ -73,6 +73,7 @@ requestBody:
stream: false
chat_template: __chat_template__
add_generation_prompt: __add_generation_prompt__
echo: false
streaming:
summary: Streaming input example
value:
@@ -92,6 +93,7 @@ requestBody:
- "<|endoftext|>"
chat_template: __chat_template__
add_generation_prompt: __add_generation_prompt__
echo: false
schema:
$ref: '#/components/schemas/ChatCompletionRequest'
responses:
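As a rough, non-authoritative illustration of what these OpenAPI examples describe: a request against the OpenAI-compatible chat endpoint could look like the snippet below. The endpoint path, model id, and field values are assumptions for the sketch; chat_template and add_generation_prompt are the optional knobs surfaced here, named after the corresponding arguments of Hugging Face's apply_chat_template.

# Illustrative request payload; path, model id, and values are placeholders, not from this diff.
import httpx

payload = {
  'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1',
  'messages': [{'role': 'user', 'content': 'Explain mixture-of-experts in one paragraph.'}],
  'max_tokens': 256,
  'stream': False,
  'stop': ['<|endoftext|>'],
  'chat_template': None,          # optionally override the model's chat template
  'add_generation_prompt': True,  # whether the template should append the assistant prefix
  'echo': False,
}
resp = httpx.post('http://localhost:3000/v1/chat/completions', json=payload, timeout=None)
print(resp.json())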
@@ -57,6 +57,7 @@ class ChatCompletionRequest:
  max_tokens: t.Optional[int] = attr.field(default=None)
  presence_penalty: t.Optional[float] = attr.field(default=None)
  frequency_penalty: t.Optional[float] = attr.field(default=None)
  echo: t.Optional[bool] = attr.field(default=False)
  logit_bias: t.Optional[t.Dict[str, float]] = attr.field(default=None)
  user: t.Optional[str] = attr.field(default=None)
  # supported by vLLM and us