From d3328343d7f59c92737a777e83e21989c6cd4a0b Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Tue, 12 Dec 2023 01:33:13 -0500
Subject: [PATCH] feat: mixtral support (#770)

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
---
 openllm-python/pyproject.toml                 |  2 +-
 openllm-python/src/openllm/_service.py        | 15 +++++++--------
 openllm-python/src/openllm/bundle/_package.py |  4 ++--
 .../src/openllm/entrypoints/_openapi.py       |  2 ++
 openllm-python/src/openllm/protocol/openai.py |  1 +
 tools/dependencies.py                         |  2 +-
 6 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml
index 6d89f7ba..7853262d 100644
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -119,7 +119,7 @@ openai = ["openai[datalib]>=1", "tiktoken"]
 playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
 qwen = ["cpm-kernels", "tiktoken"]
 starcoder = ["bitsandbytes"]
-vllm = ["vllm>=0.2.4"]
+vllm = ["vllm>=0.2.4", "megablocks", "stanford-stk", "ray==2.6.0"]
 
 [tool.hatch.version]
 fallback-version = "0.0.0"
diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py
index 46c119d9..f99f4339 100644
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 import logging, typing as t
-import _service_vars as svars
-import bentoml, openllm
+import bentoml, openllm, _service_vars as svars
 from openllm_core._schemas import MessageParam
 from bentoml.io import JSON, Text
 
@@ -16,23 +15,23 @@ llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
 @svc.api(
   route='/v1/generate',
-  input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()), #
+  input=JSON.from_sample(llm_model_class.examples()),
+  output=JSON.from_sample(openllm.GenerationOutput.examples()),
 )
 async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]:
   return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump()
 
 @svc.api(
   route='/v1/generate_stream',
-  input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'), #
+  input=JSON.from_sample(llm_model_class.examples()),
+  output=Text(content_type='text/event-stream'),
 )
 async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
-  async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
-    yield f'data: {it.model_dump_json()}\n\n'
+  async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()): yield f'data: {it.model_dump_json()}\n\n'
   yield 'data: [DONE]\n\n'
 
 _Metadata = openllm.MetadataOutput(
   timeout=llm.config['timeout'], model_name=llm.config['model_name'], #
-  backend=llm.__llm_backend__, model_id=llm.model_id, #
-  configuration=llm.config.model_dump_json().decode(),
+  backend=llm.__llm_backend__, model_id=llm.model_id, configuration=llm.config.model_dump_json().decode(), #
 )
 
 @svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py
index e83f0a53..227ce50e 100644
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -31,7 +31,7 @@ def build_editable(path, package='openllm'):
   raise RuntimeError('Please install OpenLLM from PyPI or built it from Git source.')
 def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None):
   from . import RefResolver
-  packages = ['scipy', 'bentoml[tracing]>=1.1.10', 'vllm==0.2.4', 'ray==2.6.0', f'openllm>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
+  packages = ['scipy', 'bentoml[tracing]>=1.1.10', f'openllm[vllm]>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
   if adapter_map is not None: packages += ['openllm[fine-tune]']
   if extra_dependencies is not None: packages += [f'openllm[{k}]' for k in extra_dependencies]
   if llm.config['requirements'] is not None: packages.extend(llm.config['requirements'])
@@ -50,7 +50,7 @@ def construct_docker_options(llm, _, quantize, adapter_map, dockerfile_template,
   environ['OPENLLM_CONFIG'] = f"'{environ['OPENLLM_CONFIG']}'"
   environ.pop('BENTOML_HOME', None) # NOTE: irrelevant in container
   environ['NVIDIA_DRIVER_CAPABILITIES'] = 'compute,utility'
-  return DockerOptions(cuda_version='12.1', env=environ, dockerfile_template=dockerfile_template)
+  return DockerOptions(cuda_version='12.1', python_version='3.11', env=environ, dockerfile_template=dockerfile_template)
 @inject
 def create_bento(
   bento_tag, llm_fs, llm, #
diff --git a/openllm-python/src/openllm/entrypoints/_openapi.py b/openllm-python/src/openllm/entrypoints/_openapi.py
index db1abf36..05d5111b 100644
--- a/openllm-python/src/openllm/entrypoints/_openapi.py
+++ b/openllm-python/src/openllm/entrypoints/_openapi.py
@@ -73,6 +73,7 @@ requestBody:
           stream: false
           chat_template: __chat_template__
           add_generation_prompt: __add_generation_prompt__
+          echo: false
       streaming:
         summary: Streaming input example
         value:
@@ -92,6 +93,7 @@ requestBody:
             - "<|endoftext|>"
           chat_template: __chat_template__
           add_generation_prompt: __add_generation_prompt__
+          echo: false
   schema:
     $ref: '#/components/schemas/ChatCompletionRequest'
 responses:
diff --git a/openllm-python/src/openllm/protocol/openai.py b/openllm-python/src/openllm/protocol/openai.py
index b9b1b422..390f437f 100644
--- a/openllm-python/src/openllm/protocol/openai.py
+++ b/openllm-python/src/openllm/protocol/openai.py
@@ -57,6 +57,7 @@ class ChatCompletionRequest:
   max_tokens: t.Optional[int] = attr.field(default=None)
   presence_penalty: t.Optional[float] = attr.field(default=None)
   frequency_penalty: t.Optional[float] = attr.field(default=None)
+  echo: t.Optional[bool] = attr.field(default=False)
   logit_bias: t.Optional[t.Dict[str, float]] = attr.field(default=None)
   user: t.Optional[str] = attr.field(default=None)
   # supported by vLLM and us
diff --git a/tools/dependencies.py b/tools/dependencies.py
index 74e640bd..e8312b61 100755
--- a/tools/dependencies.py
+++ b/tools/dependencies.py
@@ -155,7 +155,7 @@ GGML_DEPS = ['ctransformers']
 CTRANSLATE_DEPS = ['ctranslate2>=3.22.0']
 AWQ_DEPS = ['autoawq']
 GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2']
-VLLM_DEPS = ['vllm>=0.2.4']
+VLLM_DEPS = ['vllm>=0.2.4', 'megablocks', 'stanford-stk', 'ray==2.6.0']
 
 _base_requirements: dict[str, t.Any] = {
   inflection.dasherize(name): config_cls.__openllm_requirements__
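
Below the patch, a minimal client sketch of the SSE framing that generate_stream_v1 now emits: one 'data: <GenerationOutput JSON>' event per chunk, then a 'data: [DONE]' sentinel. The server address, the requests dependency, the prompt text, and the llm_config keys are illustrative assumptions; only the event framing and the /v1/generate_stream route come from the patch itself.

import json

import requests  # assumed client-side dependency, not part of the patch

ADDR = 'http://localhost:3000'  # assumed default `openllm start` address

# Per-request overrides go under llm_config; max_new_tokens is an
# illustrative key, not taken from the diff.
payload = {'prompt': 'What is a mixture-of-experts model?', 'llm_config': {'max_new_tokens': 128}}

with requests.post(f'{ADDR}/v1/generate_stream', json=payload, stream=True) as resp:
  resp.raise_for_status()
  for raw in resp.iter_lines():
    if not raw:
      continue  # SSE events are separated by blank lines
    line = raw.decode('utf-8')
    if not line.startswith('data: '):
      continue
    data = line[len('data: '):]
    if data == '[DONE]':
      break  # sentinel yielded after the generator finishes
    print(json.loads(data))  # one GenerationOutput dump per event

The same framing applies to the OpenAI-compatible chat route, where the new echo field rides along in the ChatCompletionRequest body and defaults to false, matching the attrs definition above.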