Mirror of https://github.com/bentoml/OpenLLM.git, synced 2026-05-03 21:32:46 -04:00
feat: mixtral support (#770)
Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
@@ -1,7 +1,6 @@
from __future__ import annotations
import logging, typing as t
import _service_vars as svars
import bentoml, openllm
import bentoml, openllm, _service_vars as svars
from openllm_core._schemas import MessageParam
from bentoml.io import JSON, Text
@@ -16,23 +15,23 @@ llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
@svc.api(
  route='/v1/generate',
  input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()), #
  input=JSON.from_sample(llm_model_class.examples()),
  output=JSON.from_sample(openllm.GenerationOutput.examples()),
)
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]: return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump()

@svc.api(
  route='/v1/generate_stream',
  input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'), #
  input=JSON.from_sample(llm_model_class.examples()),
  output=Text(content_type='text/event-stream'),
)
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
  async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
    yield f'data: {it.model_dump_json()}\n\n'
  async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()): yield f'data: {it.model_dump_json()}\n\n'
  yield 'data: [DONE]\n\n'

_Metadata = openllm.MetadataOutput(
  timeout=llm.config['timeout'], model_name=llm.config['model_name'], #
  backend=llm.__llm_backend__, model_id=llm.model_id, #
  configuration=llm.config.model_dump_json().decode(),
  backend=llm.__llm_backend__, model_id=llm.model_id, configuration=llm.config.model_dump_json().decode(), #
)

@svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
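For illustration only (not part of this patch): the /v1/generate_stream route above emits server-sent events, one data: line per generated chunk followed by a data: [DONE] sentinel. A minimal client sketch, assuming the service listens on localhost:3000, httpx is installed, and the request body only needs a prompt field:

# Minimal SSE client sketch for /v1/generate_stream; the host, port, and payload shape are assumptions.
import json
import httpx

async def stream_generate(prompt: str) -> None:
  async with httpx.AsyncClient(base_url='http://localhost:3000', timeout=None) as client:
    async with client.stream('POST', '/v1/generate_stream', json={'prompt': prompt}) as resp:
      async for line in resp.aiter_lines():
        if not line.startswith('data: '):
          continue
        payload = line[len('data: '):]
        if payload == '[DONE]':  # end-of-stream sentinel emitted by the service
          break
        print(json.loads(payload))  # one generation chunk per event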
@@ -31,7 +31,7 @@ def build_editable(path, package='openllm'):
  raise RuntimeError('Please install OpenLLM from PyPI or built it from Git source.')
def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None):
  from . import RefResolver
  packages = ['scipy', 'bentoml[tracing]>=1.1.10', 'vllm==0.2.4', 'ray==2.6.0', f'openllm>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
  packages = ['scipy', 'bentoml[tracing]>=1.1.10', f'openllm[vllm]>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
  if adapter_map is not None: packages += ['openllm[fine-tune]']
  if extra_dependencies is not None: packages += [f'openllm[{k}]' for k in extra_dependencies]
  if llm.config['requirements'] is not None: packages.extend(llm.config['requirements'])
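As a rough illustration (not from the patch): the hard vllm==0.2.4 and ray==2.6.0 pins are replaced by the openllm[vllm] extra, with the version resolved at build time via RefResolver. Assuming a hypothetical resolved release of 0.4.44 and an adapter_map being present, the list could end up as:

# Hypothetical example of the resolved `packages` list; the version number is made up for illustration.
packages = [
  'scipy',
  'bentoml[tracing]>=1.1.10',
  'openllm[vllm]>=0.4.44',  # vLLM now arrives via the extra rather than a hard pin
  'openllm[fine-tune]',     # appended only when adapter_map is not None
]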
@@ -50,7 +50,7 @@ def construct_docker_options(llm, _, quantize, adapter_map, dockerfile_template,
  environ['OPENLLM_CONFIG'] = f"'{environ['OPENLLM_CONFIG']}'"
  environ.pop('BENTOML_HOME', None) # NOTE: irrelevant in container
  environ['NVIDIA_DRIVER_CAPABILITIES'] = 'compute,utility'
  return DockerOptions(cuda_version='12.1', env=environ, dockerfile_template=dockerfile_template)
  return DockerOptions(cuda_version='12.1', python_version='3.11', env=environ, dockerfile_template=dockerfile_template)
@inject
def create_bento(
  bento_tag, llm_fs, llm, #
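For illustration only (not part of the patch): the bento now targets a Python 3.11, CUDA 12.1 base image, and the environment handed to DockerOptions roughly looks like the sketch below, with BENTOML_HOME dropped because it is irrelevant inside the container.

# Sketch of the container environment produced above; OPENLLM_CONFIG value is a placeholder.
environ = {
  'OPENLLM_CONFIG': "'{...serialized model configuration...}'",  # wrapped in single quotes for the Dockerfile
  'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility',
}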
@@ -73,6 +73,7 @@ requestBody:
stream: false
chat_template: __chat_template__
add_generation_prompt: __add_generation_prompt__
echo: false
streaming:
summary: Streaming input example
value:
@@ -92,6 +93,7 @@ requestBody:
- "<|endoftext|>"
chat_template: __chat_template__
add_generation_prompt: __add_generation_prompt__
echo: false
schema:
$ref: '#/components/schemas/ChatCompletionRequest'
responses:
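As a rough, non-authoritative illustration of what these OpenAPI examples describe: a request against the OpenAI-compatible chat endpoint could look like the snippet below. The endpoint path, model id, and field values are assumptions for the sketch; chat_template and add_generation_prompt are the optional knobs surfaced here, named after the corresponding arguments of Hugging Face's apply_chat_template.

# Illustrative request payload; path, model id, and values are placeholders, not from this diff.
import httpx

payload = {
  'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1',
  'messages': [{'role': 'user', 'content': 'Explain mixture-of-experts in one paragraph.'}],
  'max_tokens': 256,
  'stream': False,
  'stop': ['<|endoftext|>'],
  'chat_template': None,          # optionally override the model's chat template
  'add_generation_prompt': True,  # whether the template should append the assistant prefix
  'echo': False,
}
resp = httpx.post('http://localhost:3000/v1/chat/completions', json=payload, timeout=None)
print(resp.json())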
@@ -57,6 +57,7 @@ class ChatCompletionRequest:
  max_tokens: t.Optional[int] = attr.field(default=None)
  presence_penalty: t.Optional[float] = attr.field(default=None)
  frequency_penalty: t.Optional[float] = attr.field(default=None)
  echo: t.Optional[bool] = attr.field(default=False)
  logit_bias: t.Optional[t.Dict[str, float]] = attr.field(default=None)
  user: t.Optional[str] = attr.field(default=None)
  # supported by vLLM and us